1 /* 2 * Copyright (c) 2014 Scott Mansell 3 * Copyright 2014 Broadcom 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25 #include <inttypes.h> 26 #include "util/u_format.h" 27 #include "util/crc32.h" 28 #include "util/u_math.h" 29 #include "util/u_memory.h" 30 #include "util/ralloc.h" 31 #include "util/hash_table.h" 32 #include "tgsi/tgsi_dump.h" 33 #include "tgsi/tgsi_parse.h" 34 #include "compiler/nir/nir.h" 35 #include "compiler/nir/nir_builder.h" 36 #include "compiler/nir_types.h" 37 #include "nir/tgsi_to_nir.h" 38 #include "vc4_context.h" 39 #include "vc4_qpu.h" 40 #include "vc4_qir.h" 41 42 static struct qreg 43 ntq_get_src(struct vc4_compile *c, nir_src src, int i); 44 static void 45 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list); 46 47 static int 48 type_size(const struct glsl_type *type) 49 { 50 return glsl_count_attribute_slots(type, false); 51 } 52 53 static void 54 resize_qreg_array(struct vc4_compile *c, 55 struct qreg **regs, 56 uint32_t *size, 57 uint32_t decl_size) 58 { 59 if (*size >= decl_size) 60 return; 61 62 uint32_t old_size = *size; 63 *size = MAX2(*size * 2, decl_size); 64 *regs = reralloc(c, *regs, struct qreg, *size); 65 if (!*regs) { 66 fprintf(stderr, "Malloc failure\n"); 67 abort(); 68 } 69 70 for (uint32_t i = old_size; i < *size; i++) 71 (*regs)[i] = c->undef; 72 } 73 74 static void 75 ntq_emit_thrsw(struct vc4_compile *c) 76 { 77 if (!c->fs_threaded) 78 return; 79 80 /* Always thread switch after each texture operation for now. 81 * 82 * We could do better by batching a bunch of texture fetches up and 83 * then doing one thread switch and collecting all their results 84 * afterward. 85 */ 86 qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef, 87 c->undef, c->undef)); 88 c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); 89 } 90 91 static struct qreg 92 indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) 93 { 94 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); 95 uint32_t offset = nir_intrinsic_base(intr); 96 struct vc4_compiler_ubo_range *range = NULL; 97 unsigned i; 98 for (i = 0; i < c->num_uniform_ranges; i++) { 99 range = &c->ubo_ranges[i]; 100 if (offset >= range->src_offset && 101 offset < range->src_offset + range->size) { 102 break; 103 } 104 } 105 /* The driver-location-based offset always has to be within a declared 106 * uniform range. 107 */ 108 assert(range); 109 if (!range->used) { 110 range->used = true; 111 range->dst_offset = c->next_ubo_dst_offset; 112 c->next_ubo_dst_offset += range->size; 113 c->num_ubo_ranges++; 114 } 115 116 offset -= range->src_offset; 117 118 /* Adjust for where we stored the TGSI register base. */ 119 indirect_offset = qir_ADD(c, indirect_offset, 120 qir_uniform_ui(c, (range->dst_offset + 121 offset))); 122 123 /* Clamp to [0, array size). Note that MIN/MAX are signed. */ 124 indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0)); 125 indirect_offset = qir_MIN_NOIMM(c, indirect_offset, 126 qir_uniform_ui(c, (range->dst_offset + 127 range->size - 4))); 128 129 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 130 indirect_offset, 131 qir_uniform(c, QUNIFORM_UBO_ADDR, 0)); 132 133 c->num_texture_samples++; 134 135 ntq_emit_thrsw(c); 136 137 return qir_TEX_RESULT(c); 138 } 139 140 nir_ssa_def * 141 vc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) 142 { 143 switch (swiz) { 144 default: 145 case PIPE_SWIZZLE_NONE: 146 fprintf(stderr, "warning: unknown swizzle\n"); 147 /* FALLTHROUGH */ 148 case PIPE_SWIZZLE_0: 149 return nir_imm_float(b, 0.0); 150 case PIPE_SWIZZLE_1: 151 return nir_imm_float(b, 1.0); 152 case PIPE_SWIZZLE_X: 153 case PIPE_SWIZZLE_Y: 154 case PIPE_SWIZZLE_Z: 155 case PIPE_SWIZZLE_W: 156 return srcs[swiz]; 157 } 158 } 159 160 static struct qreg * 161 ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) 162 { 163 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 164 def->num_components); 165 _mesa_hash_table_insert(c->def_ht, def, qregs); 166 return qregs; 167 } 168 169 /** 170 * This function is responsible for getting QIR results into the associated 171 * storage for a NIR instruction. 172 * 173 * If it's a NIR SSA def, then we just set the associated hash table entry to 174 * the new result. 175 * 176 * If it's a NIR reg, then we need to update the existing qreg assigned to the 177 * NIR destination with the incoming value. To do that without introducing 178 * new MOVs, we require that the incoming qreg either be a uniform, or be 179 * SSA-defined by the previous QIR instruction in the block and rewritable by 180 * this function. That lets us sneak ahead and insert the SF flag beforehand 181 * (knowing that the previous instruction doesn't depend on flags) and rewrite 182 * its destination to be the NIR reg's destination 183 */ 184 static void 185 ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan, 186 struct qreg result) 187 { 188 struct qinst *last_inst = NULL; 189 if (!list_empty(&c->cur_block->instructions)) 190 last_inst = (struct qinst *)c->cur_block->instructions.prev; 191 192 assert(result.file == QFILE_UNIF || 193 (result.file == QFILE_TEMP && 194 last_inst && last_inst == c->defs[result.index])); 195 196 if (dest->is_ssa) { 197 assert(chan < dest->ssa.num_components); 198 199 struct qreg *qregs; 200 struct hash_entry *entry = 201 _mesa_hash_table_search(c->def_ht, &dest->ssa); 202 203 if (entry) 204 qregs = entry->data; 205 else 206 qregs = ntq_init_ssa_def(c, &dest->ssa); 207 208 qregs[chan] = result; 209 } else { 210 nir_register *reg = dest->reg.reg; 211 assert(dest->reg.base_offset == 0); 212 assert(reg->num_array_elems == 0); 213 struct hash_entry *entry = 214 _mesa_hash_table_search(c->def_ht, reg); 215 struct qreg *qregs = entry->data; 216 217 /* Insert a MOV if the source wasn't an SSA def in the 218 * previous instruction. 219 */ 220 if (result.file == QFILE_UNIF) { 221 result = qir_MOV(c, result); 222 last_inst = c->defs[result.index]; 223 } 224 225 /* We know they're both temps, so just rewrite index. */ 226 c->defs[last_inst->dst.index] = NULL; 227 last_inst->dst.index = qregs[chan].index; 228 229 /* If we're in control flow, then make this update of the reg 230 * conditional on the execution mask. 231 */ 232 if (c->execute.file != QFILE_NULL) { 233 last_inst->dst.index = qregs[chan].index; 234 235 /* Set the flags to the current exec mask. To insert 236 * the SF, we temporarily remove our SSA instruction. 237 */ 238 list_del(&last_inst->link); 239 qir_SF(c, c->execute); 240 list_addtail(&last_inst->link, 241 &c->cur_block->instructions); 242 243 last_inst->cond = QPU_COND_ZS; 244 last_inst->cond_is_exec_mask = true; 245 } 246 } 247 } 248 249 static struct qreg * 250 ntq_get_dest(struct vc4_compile *c, nir_dest *dest) 251 { 252 if (dest->is_ssa) { 253 struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa); 254 for (int i = 0; i < dest->ssa.num_components; i++) 255 qregs[i] = c->undef; 256 return qregs; 257 } else { 258 nir_register *reg = dest->reg.reg; 259 assert(dest->reg.base_offset == 0); 260 assert(reg->num_array_elems == 0); 261 struct hash_entry *entry = 262 _mesa_hash_table_search(c->def_ht, reg); 263 return entry->data; 264 } 265 } 266 267 static struct qreg 268 ntq_get_src(struct vc4_compile *c, nir_src src, int i) 269 { 270 struct hash_entry *entry; 271 if (src.is_ssa) { 272 entry = _mesa_hash_table_search(c->def_ht, src.ssa); 273 assert(i < src.ssa->num_components); 274 } else { 275 nir_register *reg = src.reg.reg; 276 entry = _mesa_hash_table_search(c->def_ht, reg); 277 assert(reg->num_array_elems == 0); 278 assert(src.reg.base_offset == 0); 279 assert(i < reg->num_components); 280 } 281 282 struct qreg *qregs = entry->data; 283 return qregs[i]; 284 } 285 286 static struct qreg 287 ntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr, 288 unsigned src) 289 { 290 assert(util_is_power_of_two(instr->dest.write_mask)); 291 unsigned chan = ffs(instr->dest.write_mask) - 1; 292 struct qreg r = ntq_get_src(c, instr->src[src].src, 293 instr->src[src].swizzle[chan]); 294 295 assert(!instr->src[src].abs); 296 assert(!instr->src[src].negate); 297 298 return r; 299 }; 300 301 static inline struct qreg 302 qir_SAT(struct vc4_compile *c, struct qreg val) 303 { 304 return qir_FMAX(c, 305 qir_FMIN(c, val, qir_uniform_f(c, 1.0)), 306 qir_uniform_f(c, 0.0)); 307 } 308 309 static struct qreg 310 ntq_rcp(struct vc4_compile *c, struct qreg x) 311 { 312 struct qreg r = qir_RCP(c, x); 313 314 /* Apply a Newton-Raphson step to improve the accuracy. */ 315 r = qir_FMUL(c, r, qir_FSUB(c, 316 qir_uniform_f(c, 2.0), 317 qir_FMUL(c, x, r))); 318 319 return r; 320 } 321 322 static struct qreg 323 ntq_rsq(struct vc4_compile *c, struct qreg x) 324 { 325 struct qreg r = qir_RSQ(c, x); 326 327 /* Apply a Newton-Raphson step to improve the accuracy. */ 328 r = qir_FMUL(c, r, qir_FSUB(c, 329 qir_uniform_f(c, 1.5), 330 qir_FMUL(c, 331 qir_uniform_f(c, 0.5), 332 qir_FMUL(c, x, 333 qir_FMUL(c, r, r))))); 334 335 return r; 336 } 337 338 static struct qreg 339 ntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) 340 { 341 struct qreg src0_hi = qir_SHR(c, src0, 342 qir_uniform_ui(c, 24)); 343 struct qreg src1_hi = qir_SHR(c, src1, 344 qir_uniform_ui(c, 24)); 345 346 struct qreg hilo = qir_MUL24(c, src0_hi, src1); 347 struct qreg lohi = qir_MUL24(c, src0, src1_hi); 348 struct qreg lolo = qir_MUL24(c, src0, src1); 349 350 return qir_ADD(c, lolo, qir_SHL(c, 351 qir_ADD(c, hilo, lohi), 352 qir_uniform_ui(c, 24))); 353 } 354 355 static struct qreg 356 ntq_scale_depth_texture(struct vc4_compile *c, struct qreg src) 357 { 358 struct qreg depthf = qir_ITOF(c, qir_SHR(c, src, 359 qir_uniform_ui(c, 8))); 360 return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); 361 } 362 363 /** 364 * Emits a lowered TXF_MS from an MSAA texture. 365 * 366 * The addressing math has been lowered in NIR, and now we just need to read 367 * it like a UBO. 368 */ 369 static void 370 ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr) 371 { 372 uint32_t tile_width = 32; 373 uint32_t tile_height = 32; 374 uint32_t tile_size = (tile_height * tile_width * 375 VC4_MAX_SAMPLES * sizeof(uint32_t)); 376 377 unsigned unit = instr->texture_index; 378 uint32_t w = align(c->key->tex[unit].msaa_width, tile_width); 379 uint32_t w_tiles = w / tile_width; 380 uint32_t h = align(c->key->tex[unit].msaa_height, tile_height); 381 uint32_t h_tiles = h / tile_height; 382 uint32_t size = w_tiles * h_tiles * tile_size; 383 384 struct qreg addr; 385 assert(instr->num_srcs == 1); 386 assert(instr->src[0].src_type == nir_tex_src_coord); 387 addr = ntq_get_src(c, instr->src[0].src, 0); 388 389 /* Perform the clamping required by kernel validation. */ 390 addr = qir_MAX(c, addr, qir_uniform_ui(c, 0)); 391 addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4)); 392 393 qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 394 addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit)); 395 396 ntq_emit_thrsw(c); 397 398 struct qreg tex = qir_TEX_RESULT(c); 399 c->num_texture_samples++; 400 401 enum pipe_format format = c->key->tex[unit].format; 402 if (util_format_is_depth_or_stencil(format)) { 403 struct qreg scaled = ntq_scale_depth_texture(c, tex); 404 for (int i = 0; i < 4; i++) 405 ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled)); 406 } else { 407 for (int i = 0; i < 4; i++) 408 ntq_store_dest(c, &instr->dest, i, 409 qir_UNPACK_8_F(c, tex, i)); 410 } 411 } 412 413 static void 414 ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) 415 { 416 struct qreg s, t, r, lod, compare; 417 bool is_txb = false, is_txl = false; 418 unsigned unit = instr->texture_index; 419 420 if (instr->op == nir_texop_txf) { 421 ntq_emit_txf(c, instr); 422 return; 423 } 424 425 for (unsigned i = 0; i < instr->num_srcs; i++) { 426 switch (instr->src[i].src_type) { 427 case nir_tex_src_coord: 428 s = ntq_get_src(c, instr->src[i].src, 0); 429 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D) 430 t = qir_uniform_f(c, 0.5); 431 else 432 t = ntq_get_src(c, instr->src[i].src, 1); 433 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) 434 r = ntq_get_src(c, instr->src[i].src, 2); 435 break; 436 case nir_tex_src_bias: 437 lod = ntq_get_src(c, instr->src[i].src, 0); 438 is_txb = true; 439 break; 440 case nir_tex_src_lod: 441 lod = ntq_get_src(c, instr->src[i].src, 0); 442 is_txl = true; 443 break; 444 case nir_tex_src_comparator: 445 compare = ntq_get_src(c, instr->src[i].src, 0); 446 break; 447 default: 448 unreachable("unknown texture source"); 449 } 450 } 451 452 if (c->stage != QSTAGE_FRAG && !is_txl) { 453 /* From the GLSL 1.20 spec: 454 * 455 * "If it is mip-mapped and running on the vertex shader, 456 * then the base texture is used." 457 */ 458 is_txl = true; 459 lod = qir_uniform_ui(c, 0); 460 } 461 462 if (c->key->tex[unit].force_first_level) { 463 lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit); 464 is_txl = true; 465 is_txb = false; 466 } 467 468 struct qreg texture_u[] = { 469 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit), 470 qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit), 471 qir_uniform(c, QUNIFORM_CONSTANT, 0), 472 qir_uniform(c, QUNIFORM_CONSTANT, 0), 473 }; 474 uint32_t next_texture_u = 0; 475 476 /* There is no native support for GL texture rectangle coordinates, so 477 * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0, 478 * 1]). 479 */ 480 if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) { 481 s = qir_FMUL(c, s, 482 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, unit)); 483 t = qir_FMUL(c, t, 484 qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, unit)); 485 } 486 487 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) { 488 texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2, 489 unit | (is_txl << 16)); 490 } 491 492 struct qinst *tmu; 493 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 494 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r); 495 tmu->src[qir_get_tex_uniform_src(tmu)] = 496 texture_u[next_texture_u++]; 497 } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER || 498 c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP || 499 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER || 500 c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { 501 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), 502 qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, 503 unit)); 504 tmu->src[qir_get_tex_uniform_src(tmu)] = 505 texture_u[next_texture_u++]; 506 } 507 508 if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) { 509 s = qir_SAT(c, s); 510 } 511 512 if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { 513 t = qir_SAT(c, t); 514 } 515 516 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t); 517 tmu->src[qir_get_tex_uniform_src(tmu)] = 518 texture_u[next_texture_u++]; 519 520 if (is_txl || is_txb) { 521 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod); 522 tmu->src[qir_get_tex_uniform_src(tmu)] = 523 texture_u[next_texture_u++]; 524 } 525 526 tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s); 527 tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++]; 528 529 c->num_texture_samples++; 530 531 ntq_emit_thrsw(c); 532 533 struct qreg tex = qir_TEX_RESULT(c); 534 535 enum pipe_format format = c->key->tex[unit].format; 536 537 struct qreg *dest = ntq_get_dest(c, &instr->dest); 538 if (util_format_is_depth_or_stencil(format)) { 539 struct qreg normalized = ntq_scale_depth_texture(c, tex); 540 struct qreg depth_output; 541 542 struct qreg u0 = qir_uniform_f(c, 0.0f); 543 struct qreg u1 = qir_uniform_f(c, 1.0f); 544 if (c->key->tex[unit].compare_mode) { 545 /* From the GL_ARB_shadow spec: 546 * 547 * "Let Dt (D subscript t) be the depth texture 548 * value, in the range [0, 1]. Let R be the 549 * interpolated texture coordinate clamped to the 550 * range [0, 1]." 551 */ 552 compare = qir_SAT(c, compare); 553 554 switch (c->key->tex[unit].compare_func) { 555 case PIPE_FUNC_NEVER: 556 depth_output = qir_uniform_f(c, 0.0f); 557 break; 558 case PIPE_FUNC_ALWAYS: 559 depth_output = u1; 560 break; 561 case PIPE_FUNC_EQUAL: 562 qir_SF(c, qir_FSUB(c, compare, normalized)); 563 depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0); 564 break; 565 case PIPE_FUNC_NOTEQUAL: 566 qir_SF(c, qir_FSUB(c, compare, normalized)); 567 depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0); 568 break; 569 case PIPE_FUNC_GREATER: 570 qir_SF(c, qir_FSUB(c, compare, normalized)); 571 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); 572 break; 573 case PIPE_FUNC_GEQUAL: 574 qir_SF(c, qir_FSUB(c, normalized, compare)); 575 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); 576 break; 577 case PIPE_FUNC_LESS: 578 qir_SF(c, qir_FSUB(c, compare, normalized)); 579 depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); 580 break; 581 case PIPE_FUNC_LEQUAL: 582 qir_SF(c, qir_FSUB(c, normalized, compare)); 583 depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); 584 break; 585 } 586 } else { 587 depth_output = normalized; 588 } 589 590 for (int i = 0; i < 4; i++) 591 dest[i] = depth_output; 592 } else { 593 for (int i = 0; i < 4; i++) 594 dest[i] = qir_UNPACK_8_F(c, tex, i); 595 } 596 } 597 598 /** 599 * Computes x - floor(x), which is tricky because our FTOI truncates (rounds 600 * to zero). 601 */ 602 static struct qreg 603 ntq_ffract(struct vc4_compile *c, struct qreg src) 604 { 605 struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); 606 struct qreg diff = qir_FSUB(c, src, trunc); 607 qir_SF(c, diff); 608 609 qir_FADD_dest(c, diff, 610 diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 611 612 return qir_MOV(c, diff); 613 } 614 615 /** 616 * Computes floor(x), which is tricky because our FTOI truncates (rounds to 617 * zero). 618 */ 619 static struct qreg 620 ntq_ffloor(struct vc4_compile *c, struct qreg src) 621 { 622 struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); 623 624 /* This will be < 0 if we truncated and the truncation was of a value 625 * that was < 0 in the first place. 626 */ 627 qir_SF(c, qir_FSUB(c, src, result)); 628 629 struct qinst *sub = qir_FSUB_dest(c, result, 630 result, qir_uniform_f(c, 1.0)); 631 sub->cond = QPU_COND_NS; 632 633 return qir_MOV(c, result); 634 } 635 636 /** 637 * Computes ceil(x), which is tricky because our FTOI truncates (rounds to 638 * zero). 639 */ 640 static struct qreg 641 ntq_fceil(struct vc4_compile *c, struct qreg src) 642 { 643 struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); 644 645 /* This will be < 0 if we truncated and the truncation was of a value 646 * that was > 0 in the first place. 647 */ 648 qir_SF(c, qir_FSUB(c, result, src)); 649 650 qir_FADD_dest(c, result, 651 result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 652 653 return qir_MOV(c, result); 654 } 655 656 static struct qreg 657 ntq_fsin(struct vc4_compile *c, struct qreg src) 658 { 659 float coeff[] = { 660 -2.0 * M_PI, 661 pow(2.0 * M_PI, 3) / (3 * 2 * 1), 662 -pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1), 663 pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1), 664 -pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), 665 }; 666 667 struct qreg scaled_x = 668 qir_FMUL(c, 669 src, 670 qir_uniform_f(c, 1.0 / (M_PI * 2.0))); 671 672 struct qreg x = qir_FADD(c, 673 ntq_ffract(c, scaled_x), 674 qir_uniform_f(c, -0.5)); 675 struct qreg x2 = qir_FMUL(c, x, x); 676 struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0])); 677 for (int i = 1; i < ARRAY_SIZE(coeff); i++) { 678 x = qir_FMUL(c, x, x2); 679 sum = qir_FADD(c, 680 sum, 681 qir_FMUL(c, 682 x, 683 qir_uniform_f(c, coeff[i]))); 684 } 685 return sum; 686 } 687 688 static struct qreg 689 ntq_fcos(struct vc4_compile *c, struct qreg src) 690 { 691 float coeff[] = { 692 -1.0f, 693 pow(2.0 * M_PI, 2) / (2 * 1), 694 -pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1), 695 pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1), 696 -pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), 697 pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), 698 }; 699 700 struct qreg scaled_x = 701 qir_FMUL(c, src, 702 qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); 703 struct qreg x_frac = qir_FADD(c, 704 ntq_ffract(c, scaled_x), 705 qir_uniform_f(c, -0.5)); 706 707 struct qreg sum = qir_uniform_f(c, coeff[0]); 708 struct qreg x2 = qir_FMUL(c, x_frac, x_frac); 709 struct qreg x = x2; /* Current x^2, x^4, or x^6 */ 710 for (int i = 1; i < ARRAY_SIZE(coeff); i++) { 711 if (i != 1) 712 x = qir_FMUL(c, x, x2); 713 714 struct qreg mul = qir_FMUL(c, 715 x, 716 qir_uniform_f(c, coeff[i])); 717 if (i == 0) 718 sum = mul; 719 else 720 sum = qir_FADD(c, sum, mul); 721 } 722 return sum; 723 } 724 725 static struct qreg 726 ntq_fsign(struct vc4_compile *c, struct qreg src) 727 { 728 struct qreg t = qir_get_temp(c); 729 730 qir_SF(c, src); 731 qir_MOV_dest(c, t, qir_uniform_f(c, 0.0)); 732 qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC; 733 qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS; 734 return qir_MOV(c, t); 735 } 736 737 static void 738 emit_vertex_input(struct vc4_compile *c, int attr) 739 { 740 enum pipe_format format = c->vs_key->attr_formats[attr]; 741 uint32_t attr_size = util_format_get_blocksize(format); 742 743 c->vattr_sizes[attr] = align(attr_size, 4); 744 for (int i = 0; i < align(attr_size, 4) / 4; i++) { 745 c->inputs[attr * 4 + i] = 746 qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i)); 747 c->num_inputs++; 748 } 749 } 750 751 static void 752 emit_fragcoord_input(struct vc4_compile *c, int attr) 753 { 754 c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0)); 755 c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0)); 756 c->inputs[attr * 4 + 2] = 757 qir_FMUL(c, 758 qir_ITOF(c, qir_FRAG_Z(c)), 759 qir_uniform_f(c, 1.0 / 0xffffff)); 760 c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c)); 761 } 762 763 static struct qreg 764 emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot, 765 uint8_t swizzle) 766 { 767 uint32_t i = c->num_input_slots++; 768 struct qreg vary = { 769 QFILE_VARY, 770 i 771 }; 772 773 if (c->num_input_slots >= c->input_slots_array_size) { 774 c->input_slots_array_size = 775 MAX2(4, c->input_slots_array_size * 2); 776 777 c->input_slots = reralloc(c, c->input_slots, 778 struct vc4_varying_slot, 779 c->input_slots_array_size); 780 } 781 782 c->input_slots[i].slot = slot; 783 c->input_slots[i].swizzle = swizzle; 784 785 return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c))); 786 } 787 788 static void 789 emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot) 790 { 791 for (int i = 0; i < 4; i++) { 792 c->inputs[attr * 4 + i] = 793 emit_fragment_varying(c, slot, i); 794 c->num_inputs++; 795 } 796 } 797 798 static void 799 add_output(struct vc4_compile *c, 800 uint32_t decl_offset, 801 uint8_t slot, 802 uint8_t swizzle) 803 { 804 uint32_t old_array_size = c->outputs_array_size; 805 resize_qreg_array(c, &c->outputs, &c->outputs_array_size, 806 decl_offset + 1); 807 808 if (old_array_size != c->outputs_array_size) { 809 c->output_slots = reralloc(c, 810 c->output_slots, 811 struct vc4_varying_slot, 812 c->outputs_array_size); 813 } 814 815 c->output_slots[decl_offset].slot = slot; 816 c->output_slots[decl_offset].swizzle = swizzle; 817 } 818 819 static void 820 declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size) 821 { 822 unsigned array_id = c->num_uniform_ranges++; 823 if (array_id >= c->ubo_ranges_array_size) { 824 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, 825 array_id + 1); 826 c->ubo_ranges = reralloc(c, c->ubo_ranges, 827 struct vc4_compiler_ubo_range, 828 c->ubo_ranges_array_size); 829 } 830 831 c->ubo_ranges[array_id].dst_offset = 0; 832 c->ubo_ranges[array_id].src_offset = start; 833 c->ubo_ranges[array_id].size = size; 834 c->ubo_ranges[array_id].used = false; 835 } 836 837 static bool 838 ntq_src_is_only_ssa_def_user(nir_src *src) 839 { 840 if (!src->is_ssa) 841 return false; 842 843 if (!list_empty(&src->ssa->if_uses)) 844 return false; 845 846 return (src->ssa->uses.next == &src->use_link && 847 src->ssa->uses.next->next == &src->ssa->uses); 848 } 849 850 /** 851 * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack 852 * bit set. 853 * 854 * However, as an optimization, it tries to find the instructions generating 855 * the sources to be packed and just emit the pack flag there, if possible. 856 */ 857 static void 858 ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr) 859 { 860 struct qreg result = qir_get_temp(c); 861 struct nir_alu_instr *vec4 = NULL; 862 863 /* If packing from a vec4 op (as expected), identify it so that we can 864 * peek back at what generated its sources. 865 */ 866 if (instr->src[0].src.is_ssa && 867 instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu && 868 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op == 869 nir_op_vec4) { 870 vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 871 } 872 873 /* If the pack is replicating the same channel 4 times, use the 8888 874 * pack flag. This is common for blending using the alpha 875 * channel. 876 */ 877 if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] && 878 instr->src[0].swizzle[0] == instr->src[0].swizzle[2] && 879 instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) { 880 struct qreg rep = ntq_get_src(c, 881 instr->src[0].src, 882 instr->src[0].swizzle[0]); 883 ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep)); 884 return; 885 } 886 887 for (int i = 0; i < 4; i++) { 888 int swiz = instr->src[0].swizzle[i]; 889 struct qreg src; 890 if (vec4) { 891 src = ntq_get_src(c, vec4->src[swiz].src, 892 vec4->src[swiz].swizzle[0]); 893 } else { 894 src = ntq_get_src(c, instr->src[0].src, swiz); 895 } 896 897 if (vec4 && 898 ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) && 899 src.file == QFILE_TEMP && 900 c->defs[src.index] && 901 qir_is_mul(c->defs[src.index]) && 902 !c->defs[src.index]->dst.pack) { 903 struct qinst *rewrite = c->defs[src.index]; 904 c->defs[src.index] = NULL; 905 rewrite->dst = result; 906 rewrite->dst.pack = QPU_PACK_MUL_8A + i; 907 continue; 908 } 909 910 qir_PACK_8_F(c, result, src, i); 911 } 912 913 ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result)); 914 } 915 916 /** Handles sign-extended bitfield extracts for 16 bits. */ 917 static struct qreg 918 ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset, 919 struct qreg bits) 920 { 921 assert(bits.file == QFILE_UNIF && 922 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT && 923 c->uniform_data[bits.index] == 16); 924 925 assert(offset.file == QFILE_UNIF && 926 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT); 927 int offset_bit = c->uniform_data[offset.index]; 928 assert(offset_bit % 16 == 0); 929 930 return qir_UNPACK_16_I(c, base, offset_bit / 16); 931 } 932 933 /** Handles unsigned bitfield extracts for 8 bits. */ 934 static struct qreg 935 ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset, 936 struct qreg bits) 937 { 938 assert(bits.file == QFILE_UNIF && 939 c->uniform_contents[bits.index] == QUNIFORM_CONSTANT && 940 c->uniform_data[bits.index] == 8); 941 942 assert(offset.file == QFILE_UNIF && 943 c->uniform_contents[offset.index] == QUNIFORM_CONSTANT); 944 int offset_bit = c->uniform_data[offset.index]; 945 assert(offset_bit % 8 == 0); 946 947 return qir_UNPACK_8_I(c, base, offset_bit / 8); 948 } 949 950 /** 951 * If compare_instr is a valid comparison instruction, emits the 952 * compare_instr's comparison and returns the sel_instr's return value based 953 * on the compare_instr's result. 954 */ 955 static bool 956 ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, 957 nir_alu_instr *compare_instr, 958 nir_alu_instr *sel_instr) 959 { 960 enum qpu_cond cond; 961 962 switch (compare_instr->op) { 963 case nir_op_feq: 964 case nir_op_ieq: 965 case nir_op_seq: 966 cond = QPU_COND_ZS; 967 break; 968 case nir_op_fne: 969 case nir_op_ine: 970 case nir_op_sne: 971 cond = QPU_COND_ZC; 972 break; 973 case nir_op_fge: 974 case nir_op_ige: 975 case nir_op_uge: 976 case nir_op_sge: 977 cond = QPU_COND_NC; 978 break; 979 case nir_op_flt: 980 case nir_op_ilt: 981 case nir_op_slt: 982 cond = QPU_COND_NS; 983 break; 984 default: 985 return false; 986 } 987 988 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); 989 struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1); 990 991 unsigned unsized_type = 992 nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]); 993 if (unsized_type == nir_type_float) 994 qir_SF(c, qir_FSUB(c, src0, src1)); 995 else 996 qir_SF(c, qir_SUB(c, src0, src1)); 997 998 switch (sel_instr->op) { 999 case nir_op_seq: 1000 case nir_op_sne: 1001 case nir_op_sge: 1002 case nir_op_slt: 1003 *dest = qir_SEL(c, cond, 1004 qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0)); 1005 break; 1006 1007 case nir_op_bcsel: 1008 *dest = qir_SEL(c, cond, 1009 ntq_get_alu_src(c, sel_instr, 1), 1010 ntq_get_alu_src(c, sel_instr, 2)); 1011 break; 1012 1013 default: 1014 *dest = qir_SEL(c, cond, 1015 qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0)); 1016 break; 1017 } 1018 1019 /* Make the temporary for nir_store_dest(). */ 1020 *dest = qir_MOV(c, *dest); 1021 1022 return true; 1023 } 1024 1025 /** 1026 * Attempts to fold a comparison generating a boolean result into the 1027 * condition code for selecting between two values, instead of comparing the 1028 * boolean result against 0 to generate the condition code. 1029 */ 1030 static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr, 1031 struct qreg *src) 1032 { 1033 if (!instr->src[0].src.is_ssa) 1034 goto out; 1035 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) 1036 goto out; 1037 nir_alu_instr *compare = 1038 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 1039 if (!compare) 1040 goto out; 1041 1042 struct qreg dest; 1043 if (ntq_emit_comparison(c, &dest, compare, instr)) 1044 return dest; 1045 1046 out: 1047 qir_SF(c, src[0]); 1048 return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2])); 1049 } 1050 1051 static struct qreg 1052 ntq_fddx(struct vc4_compile *c, struct qreg src) 1053 { 1054 /* Make sure that we have a bare temp to use for MUL rotation, so it 1055 * can be allocated to an accumulator. 1056 */ 1057 if (src.pack || src.file != QFILE_TEMP) 1058 src = qir_MOV(c, src); 1059 1060 struct qreg from_left = qir_ROT_MUL(c, src, 1); 1061 struct qreg from_right = qir_ROT_MUL(c, src, 15); 1062 1063 /* Distinguish left/right pixels of the quad. */ 1064 qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0), 1065 qir_uniform_ui(c, 1))); 1066 1067 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, 1068 qir_FSUB(c, from_right, src), 1069 qir_FSUB(c, src, from_left))); 1070 } 1071 1072 static struct qreg 1073 ntq_fddy(struct vc4_compile *c, struct qreg src) 1074 { 1075 if (src.pack || src.file != QFILE_TEMP) 1076 src = qir_MOV(c, src); 1077 1078 struct qreg from_bottom = qir_ROT_MUL(c, src, 2); 1079 struct qreg from_top = qir_ROT_MUL(c, src, 14); 1080 1081 /* Distinguish top/bottom pixels of the quad. */ 1082 qir_SF(c, qir_AND(c, 1083 qir_reg(QFILE_QPU_ELEMENT, 0), 1084 qir_uniform_ui(c, 2))); 1085 1086 return qir_MOV(c, qir_SEL(c, QPU_COND_ZS, 1087 qir_FSUB(c, from_top, src), 1088 qir_FSUB(c, src, from_bottom))); 1089 } 1090 1091 static void 1092 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr) 1093 { 1094 /* This should always be lowered to ALU operations for VC4. */ 1095 assert(!instr->dest.saturate); 1096 1097 /* Vectors are special in that they have non-scalarized writemasks, 1098 * and just take the first swizzle channel for each argument in order 1099 * into each writemask channel. 1100 */ 1101 if (instr->op == nir_op_vec2 || 1102 instr->op == nir_op_vec3 || 1103 instr->op == nir_op_vec4) { 1104 struct qreg srcs[4]; 1105 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1106 srcs[i] = ntq_get_src(c, instr->src[i].src, 1107 instr->src[i].swizzle[0]); 1108 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1109 ntq_store_dest(c, &instr->dest.dest, i, 1110 qir_MOV(c, srcs[i])); 1111 return; 1112 } 1113 1114 if (instr->op == nir_op_pack_unorm_4x8) { 1115 ntq_emit_pack_unorm_4x8(c, instr); 1116 return; 1117 } 1118 1119 if (instr->op == nir_op_unpack_unorm_4x8) { 1120 struct qreg src = ntq_get_src(c, instr->src[0].src, 1121 instr->src[0].swizzle[0]); 1122 for (int i = 0; i < 4; i++) { 1123 if (instr->dest.write_mask & (1 << i)) 1124 ntq_store_dest(c, &instr->dest.dest, i, 1125 qir_UNPACK_8_F(c, src, i)); 1126 } 1127 return; 1128 } 1129 1130 /* General case: We can just grab the one used channel per src. */ 1131 struct qreg src[nir_op_infos[instr->op].num_inputs]; 1132 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1133 src[i] = ntq_get_alu_src(c, instr, i); 1134 } 1135 1136 struct qreg result; 1137 1138 switch (instr->op) { 1139 case nir_op_fmov: 1140 case nir_op_imov: 1141 result = qir_MOV(c, src[0]); 1142 break; 1143 case nir_op_fmul: 1144 result = qir_FMUL(c, src[0], src[1]); 1145 break; 1146 case nir_op_fadd: 1147 result = qir_FADD(c, src[0], src[1]); 1148 break; 1149 case nir_op_fsub: 1150 result = qir_FSUB(c, src[0], src[1]); 1151 break; 1152 case nir_op_fmin: 1153 result = qir_FMIN(c, src[0], src[1]); 1154 break; 1155 case nir_op_fmax: 1156 result = qir_FMAX(c, src[0], src[1]); 1157 break; 1158 1159 case nir_op_f2i32: 1160 case nir_op_f2u32: 1161 result = qir_FTOI(c, src[0]); 1162 break; 1163 case nir_op_i2f32: 1164 case nir_op_u2f32: 1165 result = qir_ITOF(c, src[0]); 1166 break; 1167 case nir_op_b2f: 1168 result = qir_AND(c, src[0], qir_uniform_f(c, 1.0)); 1169 break; 1170 case nir_op_b2i: 1171 result = qir_AND(c, src[0], qir_uniform_ui(c, 1)); 1172 break; 1173 case nir_op_i2b: 1174 case nir_op_f2b: 1175 qir_SF(c, src[0]); 1176 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, 1177 qir_uniform_ui(c, ~0), 1178 qir_uniform_ui(c, 0))); 1179 break; 1180 1181 case nir_op_iadd: 1182 result = qir_ADD(c, src[0], src[1]); 1183 break; 1184 case nir_op_ushr: 1185 result = qir_SHR(c, src[0], src[1]); 1186 break; 1187 case nir_op_isub: 1188 result = qir_SUB(c, src[0], src[1]); 1189 break; 1190 case nir_op_ishr: 1191 result = qir_ASR(c, src[0], src[1]); 1192 break; 1193 case nir_op_ishl: 1194 result = qir_SHL(c, src[0], src[1]); 1195 break; 1196 case nir_op_imin: 1197 result = qir_MIN(c, src[0], src[1]); 1198 break; 1199 case nir_op_imax: 1200 result = qir_MAX(c, src[0], src[1]); 1201 break; 1202 case nir_op_iand: 1203 result = qir_AND(c, src[0], src[1]); 1204 break; 1205 case nir_op_ior: 1206 result = qir_OR(c, src[0], src[1]); 1207 break; 1208 case nir_op_ixor: 1209 result = qir_XOR(c, src[0], src[1]); 1210 break; 1211 case nir_op_inot: 1212 result = qir_NOT(c, src[0]); 1213 break; 1214 1215 case nir_op_imul: 1216 result = ntq_umul(c, src[0], src[1]); 1217 break; 1218 1219 case nir_op_seq: 1220 case nir_op_sne: 1221 case nir_op_sge: 1222 case nir_op_slt: 1223 case nir_op_feq: 1224 case nir_op_fne: 1225 case nir_op_fge: 1226 case nir_op_flt: 1227 case nir_op_ieq: 1228 case nir_op_ine: 1229 case nir_op_ige: 1230 case nir_op_uge: 1231 case nir_op_ilt: 1232 if (!ntq_emit_comparison(c, &result, instr, instr)) { 1233 fprintf(stderr, "Bad comparison instruction\n"); 1234 } 1235 break; 1236 1237 case nir_op_bcsel: 1238 result = ntq_emit_bcsel(c, instr, src); 1239 break; 1240 case nir_op_fcsel: 1241 qir_SF(c, src[0]); 1242 result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2])); 1243 break; 1244 1245 case nir_op_frcp: 1246 result = ntq_rcp(c, src[0]); 1247 break; 1248 case nir_op_frsq: 1249 result = ntq_rsq(c, src[0]); 1250 break; 1251 case nir_op_fexp2: 1252 result = qir_EXP2(c, src[0]); 1253 break; 1254 case nir_op_flog2: 1255 result = qir_LOG2(c, src[0]); 1256 break; 1257 1258 case nir_op_ftrunc: 1259 result = qir_ITOF(c, qir_FTOI(c, src[0])); 1260 break; 1261 case nir_op_fceil: 1262 result = ntq_fceil(c, src[0]); 1263 break; 1264 case nir_op_ffract: 1265 result = ntq_ffract(c, src[0]); 1266 break; 1267 case nir_op_ffloor: 1268 result = ntq_ffloor(c, src[0]); 1269 break; 1270 1271 case nir_op_fsin: 1272 result = ntq_fsin(c, src[0]); 1273 break; 1274 case nir_op_fcos: 1275 result = ntq_fcos(c, src[0]); 1276 break; 1277 1278 case nir_op_fsign: 1279 result = ntq_fsign(c, src[0]); 1280 break; 1281 1282 case nir_op_fabs: 1283 result = qir_FMAXABS(c, src[0], src[0]); 1284 break; 1285 case nir_op_iabs: 1286 result = qir_MAX(c, src[0], 1287 qir_SUB(c, qir_uniform_ui(c, 0), src[0])); 1288 break; 1289 1290 case nir_op_ibitfield_extract: 1291 result = ntq_emit_ibfe(c, src[0], src[1], src[2]); 1292 break; 1293 1294 case nir_op_ubitfield_extract: 1295 result = ntq_emit_ubfe(c, src[0], src[1], src[2]); 1296 break; 1297 1298 case nir_op_usadd_4x8: 1299 result = qir_V8ADDS(c, src[0], src[1]); 1300 break; 1301 1302 case nir_op_ussub_4x8: 1303 result = qir_V8SUBS(c, src[0], src[1]); 1304 break; 1305 1306 case nir_op_umin_4x8: 1307 result = qir_V8MIN(c, src[0], src[1]); 1308 break; 1309 1310 case nir_op_umax_4x8: 1311 result = qir_V8MAX(c, src[0], src[1]); 1312 break; 1313 1314 case nir_op_umul_unorm_4x8: 1315 result = qir_V8MULD(c, src[0], src[1]); 1316 break; 1317 1318 case nir_op_fddx: 1319 case nir_op_fddx_coarse: 1320 case nir_op_fddx_fine: 1321 result = ntq_fddx(c, src[0]); 1322 break; 1323 1324 case nir_op_fddy: 1325 case nir_op_fddy_coarse: 1326 case nir_op_fddy_fine: 1327 result = ntq_fddy(c, src[0]); 1328 break; 1329 1330 default: 1331 fprintf(stderr, "unknown NIR ALU inst: "); 1332 nir_print_instr(&instr->instr, stderr); 1333 fprintf(stderr, "\n"); 1334 abort(); 1335 } 1336 1337 /* We have a scalar result, so the instruction should only have a 1338 * single channel written to. 1339 */ 1340 assert(util_is_power_of_two(instr->dest.write_mask)); 1341 ntq_store_dest(c, &instr->dest.dest, 1342 ffs(instr->dest.write_mask) - 1, result); 1343 } 1344 1345 static void 1346 emit_frag_end(struct vc4_compile *c) 1347 { 1348 struct qreg color; 1349 if (c->output_color_index != -1) { 1350 color = c->outputs[c->output_color_index]; 1351 } else { 1352 color = qir_uniform_ui(c, 0); 1353 } 1354 1355 uint32_t discard_cond = QPU_COND_ALWAYS; 1356 if (c->s->info.fs.uses_discard) { 1357 qir_SF(c, c->discard); 1358 discard_cond = QPU_COND_ZS; 1359 } 1360 1361 if (c->fs_key->stencil_enabled) { 1362 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), 1363 qir_uniform(c, QUNIFORM_STENCIL, 0)); 1364 if (c->fs_key->stencil_twoside) { 1365 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), 1366 qir_uniform(c, QUNIFORM_STENCIL, 1)); 1367 } 1368 if (c->fs_key->stencil_full_writemasks) { 1369 qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), 1370 qir_uniform(c, QUNIFORM_STENCIL, 2)); 1371 } 1372 } 1373 1374 if (c->output_sample_mask_index != -1) { 1375 qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]); 1376 } 1377 1378 if (c->fs_key->depth_enabled) { 1379 if (c->output_position_index != -1) { 1380 qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0), 1381 qir_FMUL(c, 1382 c->outputs[c->output_position_index], 1383 qir_uniform_f(c, 0xffffff)))->cond = discard_cond; 1384 } else { 1385 qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0), 1386 qir_FRAG_Z(c))->cond = discard_cond; 1387 } 1388 } 1389 1390 if (!c->msaa_per_sample_output) { 1391 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0), 1392 color)->cond = discard_cond; 1393 } else { 1394 for (int i = 0; i < VC4_MAX_SAMPLES; i++) { 1395 qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0), 1396 c->sample_colors[i])->cond = discard_cond; 1397 } 1398 } 1399 } 1400 1401 static void 1402 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w) 1403 { 1404 struct qreg packed = qir_get_temp(c); 1405 1406 for (int i = 0; i < 2; i++) { 1407 struct qreg scale = 1408 qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0); 1409 1410 struct qreg packed_chan = packed; 1411 packed_chan.pack = QPU_PACK_A_16A + i; 1412 1413 qir_FTOI_dest(c, packed_chan, 1414 qir_FMUL(c, 1415 qir_FMUL(c, 1416 c->outputs[c->output_position_index + i], 1417 scale), 1418 rcp_w)); 1419 } 1420 1421 qir_VPM_WRITE(c, packed); 1422 } 1423 1424 static void 1425 emit_zs_write(struct vc4_compile *c, struct qreg rcp_w) 1426 { 1427 struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); 1428 struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); 1429 1430 qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c, 1431 c->outputs[c->output_position_index + 2], 1432 zscale), 1433 rcp_w), 1434 zoffset)); 1435 } 1436 1437 static void 1438 emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w) 1439 { 1440 qir_VPM_WRITE(c, rcp_w); 1441 } 1442 1443 static void 1444 emit_point_size_write(struct vc4_compile *c) 1445 { 1446 struct qreg point_size; 1447 1448 if (c->output_point_size_index != -1) 1449 point_size = c->outputs[c->output_point_size_index]; 1450 else 1451 point_size = qir_uniform_f(c, 1.0); 1452 1453 /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, 1454 * BCM21553). 1455 */ 1456 point_size = qir_FMAX(c, point_size, qir_uniform_f(c, .125)); 1457 1458 qir_VPM_WRITE(c, point_size); 1459 } 1460 1461 /** 1462 * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c. 1463 * 1464 * The simulator insists that there be at least one vertex attribute, so 1465 * vc4_draw.c will emit one if it wouldn't have otherwise. The simulator also 1466 * insists that all vertex attributes loaded get read by the VS/CS, so we have 1467 * to consume it here. 1468 */ 1469 static void 1470 emit_stub_vpm_read(struct vc4_compile *c) 1471 { 1472 if (c->num_inputs) 1473 return; 1474 1475 c->vattr_sizes[0] = 4; 1476 (void)qir_MOV(c, qir_reg(QFILE_VPM, 0)); 1477 c->num_inputs++; 1478 } 1479 1480 static void 1481 emit_vert_end(struct vc4_compile *c, 1482 struct vc4_varying_slot *fs_inputs, 1483 uint32_t num_fs_inputs) 1484 { 1485 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]); 1486 1487 emit_stub_vpm_read(c); 1488 1489 emit_scaled_viewport_write(c, rcp_w); 1490 emit_zs_write(c, rcp_w); 1491 emit_rcp_wc_write(c, rcp_w); 1492 if (c->vs_key->per_vertex_point_size) 1493 emit_point_size_write(c); 1494 1495 for (int i = 0; i < num_fs_inputs; i++) { 1496 struct vc4_varying_slot *input = &fs_inputs[i]; 1497 int j; 1498 1499 for (j = 0; j < c->num_outputs; j++) { 1500 struct vc4_varying_slot *output = 1501 &c->output_slots[j]; 1502 1503 if (input->slot == output->slot && 1504 input->swizzle == output->swizzle) { 1505 qir_VPM_WRITE(c, c->outputs[j]); 1506 break; 1507 } 1508 } 1509 /* Emit padding if we didn't find a declared VS output for 1510 * this FS input. 1511 */ 1512 if (j == c->num_outputs) 1513 qir_VPM_WRITE(c, qir_uniform_f(c, 0.0)); 1514 } 1515 } 1516 1517 static void 1518 emit_coord_end(struct vc4_compile *c) 1519 { 1520 struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]); 1521 1522 emit_stub_vpm_read(c); 1523 1524 for (int i = 0; i < 4; i++) 1525 qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]); 1526 1527 emit_scaled_viewport_write(c, rcp_w); 1528 emit_zs_write(c, rcp_w); 1529 emit_rcp_wc_write(c, rcp_w); 1530 if (c->vs_key->per_vertex_point_size) 1531 emit_point_size_write(c); 1532 } 1533 1534 static void 1535 vc4_optimize_nir(struct nir_shader *s) 1536 { 1537 bool progress; 1538 1539 do { 1540 progress = false; 1541 1542 NIR_PASS_V(s, nir_lower_vars_to_ssa); 1543 NIR_PASS(progress, s, nir_lower_alu_to_scalar); 1544 NIR_PASS(progress, s, nir_lower_phis_to_scalar); 1545 NIR_PASS(progress, s, nir_copy_prop); 1546 NIR_PASS(progress, s, nir_opt_remove_phis); 1547 NIR_PASS(progress, s, nir_opt_dce); 1548 NIR_PASS(progress, s, nir_opt_dead_cf); 1549 NIR_PASS(progress, s, nir_opt_cse); 1550 NIR_PASS(progress, s, nir_opt_peephole_select, 8); 1551 NIR_PASS(progress, s, nir_opt_algebraic); 1552 NIR_PASS(progress, s, nir_opt_constant_folding); 1553 NIR_PASS(progress, s, nir_opt_undef); 1554 NIR_PASS(progress, s, nir_opt_loop_unroll, 1555 nir_var_shader_in | 1556 nir_var_shader_out | 1557 nir_var_local); 1558 } while (progress); 1559 } 1560 1561 static int 1562 driver_location_compare(const void *in_a, const void *in_b) 1563 { 1564 const nir_variable *const *a = in_a; 1565 const nir_variable *const *b = in_b; 1566 1567 return (*a)->data.driver_location - (*b)->data.driver_location; 1568 } 1569 1570 static void 1571 ntq_setup_inputs(struct vc4_compile *c) 1572 { 1573 unsigned num_entries = 0; 1574 nir_foreach_variable(var, &c->s->inputs) 1575 num_entries++; 1576 1577 nir_variable *vars[num_entries]; 1578 1579 unsigned i = 0; 1580 nir_foreach_variable(var, &c->s->inputs) 1581 vars[i++] = var; 1582 1583 /* Sort the variables so that we emit the input setup in 1584 * driver_location order. This is required for VPM reads, whose data 1585 * is fetched into the VPM in driver_location (TGSI register index) 1586 * order. 1587 */ 1588 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); 1589 1590 for (unsigned i = 0; i < num_entries; i++) { 1591 nir_variable *var = vars[i]; 1592 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1593 unsigned loc = var->data.driver_location; 1594 1595 assert(array_len == 1); 1596 (void)array_len; 1597 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 1598 (loc + 1) * 4); 1599 1600 if (c->stage == QSTAGE_FRAG) { 1601 if (var->data.location == VARYING_SLOT_POS) { 1602 emit_fragcoord_input(c, loc); 1603 } else if (var->data.location == VARYING_SLOT_PNTC || 1604 (var->data.location >= VARYING_SLOT_VAR0 && 1605 (c->fs_key->point_sprite_mask & 1606 (1 << (var->data.location - 1607 VARYING_SLOT_VAR0))))) { 1608 c->inputs[loc * 4 + 0] = c->point_x; 1609 c->inputs[loc * 4 + 1] = c->point_y; 1610 } else { 1611 emit_fragment_input(c, loc, var->data.location); 1612 } 1613 } else { 1614 emit_vertex_input(c, loc); 1615 } 1616 } 1617 } 1618 1619 static void 1620 ntq_setup_outputs(struct vc4_compile *c) 1621 { 1622 nir_foreach_variable(var, &c->s->outputs) { 1623 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1624 unsigned loc = var->data.driver_location * 4; 1625 1626 assert(array_len == 1); 1627 (void)array_len; 1628 1629 for (int i = 0; i < 4; i++) 1630 add_output(c, loc + i, var->data.location, i); 1631 1632 if (c->stage == QSTAGE_FRAG) { 1633 switch (var->data.location) { 1634 case FRAG_RESULT_COLOR: 1635 case FRAG_RESULT_DATA0: 1636 c->output_color_index = loc; 1637 break; 1638 case FRAG_RESULT_DEPTH: 1639 c->output_position_index = loc; 1640 break; 1641 case FRAG_RESULT_SAMPLE_MASK: 1642 c->output_sample_mask_index = loc; 1643 break; 1644 } 1645 } else { 1646 switch (var->data.location) { 1647 case VARYING_SLOT_POS: 1648 c->output_position_index = loc; 1649 break; 1650 case VARYING_SLOT_PSIZ: 1651 c->output_point_size_index = loc; 1652 break; 1653 } 1654 } 1655 } 1656 } 1657 1658 static void 1659 ntq_setup_uniforms(struct vc4_compile *c) 1660 { 1661 nir_foreach_variable(var, &c->s->uniforms) { 1662 uint32_t vec4_count = type_size(var->type); 1663 unsigned vec4_size = 4 * sizeof(float); 1664 1665 declare_uniform_range(c, var->data.driver_location * vec4_size, 1666 vec4_count * vec4_size); 1667 1668 } 1669 } 1670 1671 /** 1672 * Sets up the mapping from nir_register to struct qreg *. 1673 * 1674 * Each nir_register gets a struct qreg per 32-bit component being stored. 1675 */ 1676 static void 1677 ntq_setup_registers(struct vc4_compile *c, struct exec_list *list) 1678 { 1679 foreach_list_typed(nir_register, nir_reg, node, list) { 1680 unsigned array_len = MAX2(nir_reg->num_array_elems, 1); 1681 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 1682 array_len * 1683 nir_reg->num_components); 1684 1685 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); 1686 1687 for (int i = 0; i < array_len * nir_reg->num_components; i++) 1688 qregs[i] = qir_get_temp(c); 1689 } 1690 } 1691 1692 static void 1693 ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) 1694 { 1695 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 1696 for (int i = 0; i < instr->def.num_components; i++) 1697 qregs[i] = qir_uniform_ui(c, instr->value.u32[i]); 1698 1699 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); 1700 } 1701 1702 static void 1703 ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr) 1704 { 1705 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 1706 1707 /* QIR needs there to be *some* value, so pick 0 (same as for 1708 * ntq_setup_registers(). 1709 */ 1710 for (int i = 0; i < instr->def.num_components; i++) 1711 qregs[i] = qir_uniform_ui(c, 0); 1712 } 1713 1714 static void 1715 ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr) 1716 { 1717 assert(nir_src_as_const_value(instr->src[0])->u32[0] == 0); 1718 1719 /* Reads of the per-sample color need to be done in 1720 * order. 1721 */ 1722 int sample_index = (nir_intrinsic_base(instr) - 1723 VC4_NIR_TLB_COLOR_READ_INPUT); 1724 for (int i = 0; i <= sample_index; i++) { 1725 if (c->color_reads[i].file == QFILE_NULL) { 1726 c->color_reads[i] = 1727 qir_TLB_COLOR_READ(c); 1728 } 1729 } 1730 ntq_store_dest(c, &instr->dest, 0, 1731 qir_MOV(c, c->color_reads[sample_index])); 1732 } 1733 1734 static void 1735 ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr) 1736 { 1737 assert(instr->num_components == 1); 1738 1739 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 1740 assert(const_offset && "vc4 doesn't support indirect inputs"); 1741 1742 if (c->stage == QSTAGE_FRAG && 1743 nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) { 1744 ntq_emit_color_read(c, instr); 1745 return; 1746 } 1747 1748 uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0]; 1749 int comp = nir_intrinsic_component(instr); 1750 ntq_store_dest(c, &instr->dest, 0, 1751 qir_MOV(c, c->inputs[offset * 4 + comp])); 1752 } 1753 1754 static void 1755 ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) 1756 { 1757 nir_const_value *const_offset; 1758 unsigned offset; 1759 1760 switch (instr->intrinsic) { 1761 case nir_intrinsic_load_uniform: 1762 assert(instr->num_components == 1); 1763 const_offset = nir_src_as_const_value(instr->src[0]); 1764 if (const_offset) { 1765 offset = nir_intrinsic_base(instr) + const_offset->u32[0]; 1766 assert(offset % 4 == 0); 1767 /* We need dwords */ 1768 offset = offset / 4; 1769 ntq_store_dest(c, &instr->dest, 0, 1770 qir_uniform(c, QUNIFORM_UNIFORM, 1771 offset)); 1772 } else { 1773 ntq_store_dest(c, &instr->dest, 0, 1774 indirect_uniform_load(c, instr)); 1775 } 1776 break; 1777 1778 case nir_intrinsic_load_user_clip_plane: 1779 for (int i = 0; i < instr->num_components; i++) { 1780 ntq_store_dest(c, &instr->dest, i, 1781 qir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 1782 nir_intrinsic_ucp_id(instr) * 1783 4 + i)); 1784 } 1785 break; 1786 1787 case nir_intrinsic_load_blend_const_color_r_float: 1788 case nir_intrinsic_load_blend_const_color_g_float: 1789 case nir_intrinsic_load_blend_const_color_b_float: 1790 case nir_intrinsic_load_blend_const_color_a_float: 1791 ntq_store_dest(c, &instr->dest, 0, 1792 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X + 1793 (instr->intrinsic - 1794 nir_intrinsic_load_blend_const_color_r_float), 1795 0)); 1796 break; 1797 1798 case nir_intrinsic_load_blend_const_color_rgba8888_unorm: 1799 ntq_store_dest(c, &instr->dest, 0, 1800 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA, 1801 0)); 1802 break; 1803 1804 case nir_intrinsic_load_blend_const_color_aaaa8888_unorm: 1805 ntq_store_dest(c, &instr->dest, 0, 1806 qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA, 1807 0)); 1808 break; 1809 1810 case nir_intrinsic_load_alpha_ref_float: 1811 ntq_store_dest(c, &instr->dest, 0, 1812 qir_uniform(c, QUNIFORM_ALPHA_REF, 0)); 1813 break; 1814 1815 case nir_intrinsic_load_sample_mask_in: 1816 ntq_store_dest(c, &instr->dest, 0, 1817 qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0)); 1818 break; 1819 1820 case nir_intrinsic_load_front_face: 1821 /* The register contains 0 (front) or 1 (back), and we need to 1822 * turn it into a NIR bool where true means front. 1823 */ 1824 ntq_store_dest(c, &instr->dest, 0, 1825 qir_ADD(c, 1826 qir_uniform_ui(c, -1), 1827 qir_reg(QFILE_FRAG_REV_FLAG, 0))); 1828 break; 1829 1830 case nir_intrinsic_load_input: 1831 ntq_emit_load_input(c, instr); 1832 break; 1833 1834 case nir_intrinsic_store_output: 1835 const_offset = nir_src_as_const_value(instr->src[1]); 1836 assert(const_offset && "vc4 doesn't support indirect outputs"); 1837 offset = nir_intrinsic_base(instr) + const_offset->u32[0]; 1838 1839 /* MSAA color outputs are the only case where we have an 1840 * output that's not lowered to being a store of a single 32 1841 * bit value. 1842 */ 1843 if (c->stage == QSTAGE_FRAG && instr->num_components == 4) { 1844 assert(offset == c->output_color_index); 1845 for (int i = 0; i < 4; i++) { 1846 c->sample_colors[i] = 1847 qir_MOV(c, ntq_get_src(c, instr->src[0], 1848 i)); 1849 } 1850 } else { 1851 offset = offset * 4 + nir_intrinsic_component(instr); 1852 assert(instr->num_components == 1); 1853 c->outputs[offset] = 1854 qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); 1855 c->num_outputs = MAX2(c->num_outputs, offset + 1); 1856 } 1857 break; 1858 1859 case nir_intrinsic_discard: 1860 if (c->execute.file != QFILE_NULL) { 1861 qir_SF(c, c->execute); 1862 qir_MOV_cond(c, QPU_COND_ZS, c->discard, 1863 qir_uniform_ui(c, ~0)); 1864 } else { 1865 qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0)); 1866 } 1867 break; 1868 1869 case nir_intrinsic_discard_if: { 1870 /* true (~0) if we're discarding */ 1871 struct qreg cond = ntq_get_src(c, instr->src[0], 0); 1872 1873 if (c->execute.file != QFILE_NULL) { 1874 /* execute == 0 means the channel is active. Invert 1875 * the condition so that we can use zero as "executing 1876 * and discarding." 1877 */ 1878 qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond))); 1879 qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond); 1880 } else { 1881 qir_OR_dest(c, c->discard, c->discard, 1882 ntq_get_src(c, instr->src[0], 0)); 1883 } 1884 1885 break; 1886 } 1887 1888 default: 1889 fprintf(stderr, "Unknown intrinsic: "); 1890 nir_print_instr(&instr->instr, stderr); 1891 fprintf(stderr, "\n"); 1892 break; 1893 } 1894 } 1895 1896 /* Clears (activates) the execute flags for any channels whose jump target 1897 * matches this block. 1898 */ 1899 static void 1900 ntq_activate_execute_for_block(struct vc4_compile *c) 1901 { 1902 qir_SF(c, qir_SUB(c, 1903 c->execute, 1904 qir_uniform_ui(c, c->cur_block->index))); 1905 qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0)); 1906 } 1907 1908 static void 1909 ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt) 1910 { 1911 if (!c->vc4->screen->has_control_flow) { 1912 fprintf(stderr, 1913 "IF statement support requires updated kernel.\n"); 1914 return; 1915 } 1916 1917 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 1918 bool empty_else_block = 1919 (nir_else_block == nir_if_last_else_block(if_stmt) && 1920 exec_list_is_empty(&nir_else_block->instr_list)); 1921 1922 struct qblock *then_block = qir_new_block(c); 1923 struct qblock *after_block = qir_new_block(c); 1924 struct qblock *else_block; 1925 if (empty_else_block) 1926 else_block = after_block; 1927 else 1928 else_block = qir_new_block(c); 1929 1930 bool was_top_level = false; 1931 if (c->execute.file == QFILE_NULL) { 1932 c->execute = qir_MOV(c, qir_uniform_ui(c, 0)); 1933 was_top_level = true; 1934 } 1935 1936 /* Set ZS for executing (execute == 0) and jumping (if->condition == 1937 * 0) channels, and then update execute flags for those to point to 1938 * the ELSE block. 1939 */ 1940 qir_SF(c, qir_OR(c, 1941 c->execute, 1942 ntq_get_src(c, if_stmt->condition, 0))); 1943 qir_MOV_cond(c, QPU_COND_ZS, c->execute, 1944 qir_uniform_ui(c, else_block->index)); 1945 1946 /* Jump to ELSE if nothing is active for THEN, otherwise fall 1947 * through. 1948 */ 1949 qir_SF(c, c->execute); 1950 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC); 1951 qir_link_blocks(c->cur_block, else_block); 1952 qir_link_blocks(c->cur_block, then_block); 1953 1954 /* Process the THEN block. */ 1955 qir_set_emit_block(c, then_block); 1956 ntq_emit_cf_list(c, &if_stmt->then_list); 1957 1958 if (!empty_else_block) { 1959 /* Handle the end of the THEN block. First, all currently 1960 * active channels update their execute flags to point to 1961 * ENDIF 1962 */ 1963 qir_SF(c, c->execute); 1964 qir_MOV_cond(c, QPU_COND_ZS, c->execute, 1965 qir_uniform_ui(c, after_block->index)); 1966 1967 /* If everything points at ENDIF, then jump there immediately. */ 1968 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index))); 1969 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS); 1970 qir_link_blocks(c->cur_block, after_block); 1971 qir_link_blocks(c->cur_block, else_block); 1972 1973 qir_set_emit_block(c, else_block); 1974 ntq_activate_execute_for_block(c); 1975 ntq_emit_cf_list(c, &if_stmt->else_list); 1976 } 1977 1978 qir_link_blocks(c->cur_block, after_block); 1979 1980 qir_set_emit_block(c, after_block); 1981 if (was_top_level) { 1982 c->execute = c->undef; 1983 c->last_top_block = c->cur_block; 1984 } else { 1985 ntq_activate_execute_for_block(c); 1986 } 1987 } 1988 1989 static void 1990 ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump) 1991 { 1992 struct qblock *jump_block; 1993 switch (jump->type) { 1994 case nir_jump_break: 1995 jump_block = c->loop_break_block; 1996 break; 1997 case nir_jump_continue: 1998 jump_block = c->loop_cont_block; 1999 break; 2000 default: 2001 unreachable("Unsupported jump type\n"); 2002 } 2003 2004 qir_SF(c, c->execute); 2005 qir_MOV_cond(c, QPU_COND_ZS, c->execute, 2006 qir_uniform_ui(c, jump_block->index)); 2007 2008 /* Jump to the destination block if everyone has taken the jump. */ 2009 qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index))); 2010 qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS); 2011 struct qblock *new_block = qir_new_block(c); 2012 qir_link_blocks(c->cur_block, jump_block); 2013 qir_link_blocks(c->cur_block, new_block); 2014 qir_set_emit_block(c, new_block); 2015 } 2016 2017 static void 2018 ntq_emit_instr(struct vc4_compile *c, nir_instr *instr) 2019 { 2020 switch (instr->type) { 2021 case nir_instr_type_alu: 2022 ntq_emit_alu(c, nir_instr_as_alu(instr)); 2023 break; 2024 2025 case nir_instr_type_intrinsic: 2026 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 2027 break; 2028 2029 case nir_instr_type_load_const: 2030 ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 2031 break; 2032 2033 case nir_instr_type_ssa_undef: 2034 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); 2035 break; 2036 2037 case nir_instr_type_tex: 2038 ntq_emit_tex(c, nir_instr_as_tex(instr)); 2039 break; 2040 2041 case nir_instr_type_jump: 2042 ntq_emit_jump(c, nir_instr_as_jump(instr)); 2043 break; 2044 2045 default: 2046 fprintf(stderr, "Unknown NIR instr type: "); 2047 nir_print_instr(instr, stderr); 2048 fprintf(stderr, "\n"); 2049 abort(); 2050 } 2051 } 2052 2053 static void 2054 ntq_emit_block(struct vc4_compile *c, nir_block *block) 2055 { 2056 nir_foreach_instr(instr, block) { 2057 ntq_emit_instr(c, instr); 2058 } 2059 } 2060 2061 static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list); 2062 2063 static void 2064 ntq_emit_loop(struct vc4_compile *c, nir_loop *loop) 2065 { 2066 if (!c->vc4->screen->has_control_flow) { 2067 fprintf(stderr, 2068 "loop support requires updated kernel.\n"); 2069 ntq_emit_cf_list(c, &loop->body); 2070 return; 2071 } 2072 2073 bool was_top_level = false; 2074 if (c->execute.file == QFILE_NULL) { 2075 c->execute = qir_MOV(c, qir_uniform_ui(c, 0)); 2076 was_top_level = true; 2077 } 2078 2079 struct qblock *save_loop_cont_block = c->loop_cont_block; 2080 struct qblock *save_loop_break_block = c->loop_break_block; 2081 2082 c->loop_cont_block = qir_new_block(c); 2083 c->loop_break_block = qir_new_block(c); 2084 2085 qir_link_blocks(c->cur_block, c->loop_cont_block); 2086 qir_set_emit_block(c, c->loop_cont_block); 2087 ntq_activate_execute_for_block(c); 2088 2089 ntq_emit_cf_list(c, &loop->body); 2090 2091 /* If anything had explicitly continued, or is here at the end of the 2092 * loop, then we need to loop again. SF updates are masked by the 2093 * instruction's condition, so we can do the OR of the two conditions 2094 * within SF. 2095 */ 2096 qir_SF(c, c->execute); 2097 struct qinst *cont_check = 2098 qir_SUB_dest(c, 2099 c->undef, 2100 c->execute, 2101 qir_uniform_ui(c, c->loop_cont_block->index)); 2102 cont_check->cond = QPU_COND_ZC; 2103 cont_check->sf = true; 2104 2105 qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS); 2106 qir_link_blocks(c->cur_block, c->loop_cont_block); 2107 qir_link_blocks(c->cur_block, c->loop_break_block); 2108 2109 qir_set_emit_block(c, c->loop_break_block); 2110 if (was_top_level) { 2111 c->execute = c->undef; 2112 c->last_top_block = c->cur_block; 2113 } else { 2114 ntq_activate_execute_for_block(c); 2115 } 2116 2117 c->loop_break_block = save_loop_break_block; 2118 c->loop_cont_block = save_loop_cont_block; 2119 } 2120 2121 static void 2122 ntq_emit_function(struct vc4_compile *c, nir_function_impl *func) 2123 { 2124 fprintf(stderr, "FUNCTIONS not handled.\n"); 2125 abort(); 2126 } 2127 2128 static void 2129 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list) 2130 { 2131 foreach_list_typed(nir_cf_node, node, node, list) { 2132 switch (node->type) { 2133 case nir_cf_node_block: 2134 ntq_emit_block(c, nir_cf_node_as_block(node)); 2135 break; 2136 2137 case nir_cf_node_if: 2138 ntq_emit_if(c, nir_cf_node_as_if(node)); 2139 break; 2140 2141 case nir_cf_node_loop: 2142 ntq_emit_loop(c, nir_cf_node_as_loop(node)); 2143 break; 2144 2145 case nir_cf_node_function: 2146 ntq_emit_function(c, nir_cf_node_as_function(node)); 2147 break; 2148 2149 default: 2150 fprintf(stderr, "Unknown NIR node type\n"); 2151 abort(); 2152 } 2153 } 2154 } 2155 2156 static void 2157 ntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl) 2158 { 2159 ntq_setup_registers(c, &impl->registers); 2160 ntq_emit_cf_list(c, &impl->body); 2161 } 2162 2163 static void 2164 nir_to_qir(struct vc4_compile *c) 2165 { 2166 if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard) 2167 c->discard = qir_MOV(c, qir_uniform_ui(c, 0)); 2168 2169 ntq_setup_inputs(c); 2170 ntq_setup_outputs(c); 2171 ntq_setup_uniforms(c); 2172 ntq_setup_registers(c, &c->s->registers); 2173 2174 /* Find the main function and emit the body. */ 2175 nir_foreach_function(function, c->s) { 2176 assert(strcmp(function->name, "main") == 0); 2177 assert(function->impl); 2178 ntq_emit_impl(c, function->impl); 2179 } 2180 } 2181 2182 static const nir_shader_compiler_options nir_options = { 2183 .lower_extract_byte = true, 2184 .lower_extract_word = true, 2185 .lower_ffma = true, 2186 .lower_flrp32 = true, 2187 .lower_fpow = true, 2188 .lower_fsat = true, 2189 .lower_fsqrt = true, 2190 .lower_negate = true, 2191 .native_integers = true, 2192 .max_unroll_iterations = 32, 2193 }; 2194 2195 const void * 2196 vc4_screen_get_compiler_options(struct pipe_screen *pscreen, 2197 enum pipe_shader_ir ir, 2198 enum pipe_shader_type shader) 2199 { 2200 return &nir_options; 2201 } 2202 2203 static int 2204 count_nir_instrs(nir_shader *nir) 2205 { 2206 int count = 0; 2207 nir_foreach_function(function, nir) { 2208 if (!function->impl) 2209 continue; 2210 nir_foreach_block(block, function->impl) { 2211 nir_foreach_instr(instr, block) 2212 count++; 2213 } 2214 } 2215 return count; 2216 } 2217 2218 static struct vc4_compile * 2219 vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, 2220 struct vc4_key *key, bool fs_threaded) 2221 { 2222 struct vc4_compile *c = qir_compile_init(); 2223 2224 c->vc4 = vc4; 2225 c->stage = stage; 2226 c->shader_state = &key->shader_state->base; 2227 c->program_id = key->shader_state->program_id; 2228 c->variant_id = 2229 p_atomic_inc_return(&key->shader_state->compiled_variant_count); 2230 c->fs_threaded = fs_threaded; 2231 2232 c->key = key; 2233 switch (stage) { 2234 case QSTAGE_FRAG: 2235 c->fs_key = (struct vc4_fs_key *)key; 2236 if (c->fs_key->is_points) { 2237 c->point_x = emit_fragment_varying(c, ~0, 0); 2238 c->point_y = emit_fragment_varying(c, ~0, 0); 2239 } else if (c->fs_key->is_lines) { 2240 c->line_x = emit_fragment_varying(c, ~0, 0); 2241 } 2242 break; 2243 case QSTAGE_VERT: 2244 c->vs_key = (struct vc4_vs_key *)key; 2245 break; 2246 case QSTAGE_COORD: 2247 c->vs_key = (struct vc4_vs_key *)key; 2248 break; 2249 } 2250 2251 c->s = nir_shader_clone(c, key->shader_state->base.ir.nir); 2252 2253 if (stage == QSTAGE_FRAG) { 2254 if (c->fs_key->alpha_test_func != COMPARE_FUNC_ALWAYS) { 2255 NIR_PASS_V(c->s, nir_lower_alpha_test, 2256 c->fs_key->alpha_test_func, 2257 c->fs_key->sample_alpha_to_one && 2258 c->fs_key->msaa); 2259 } 2260 NIR_PASS_V(c->s, vc4_nir_lower_blend, c); 2261 } 2262 2263 struct nir_lower_tex_options tex_options = { 2264 /* We would need to implement txs, but we don't want the 2265 * int/float conversions 2266 */ 2267 .lower_rect = false, 2268 2269 .lower_txp = ~0, 2270 2271 /* Apply swizzles to all samplers. */ 2272 .swizzle_result = ~0, 2273 }; 2274 2275 /* Lower the format swizzle and ARB_texture_swizzle-style swizzle. 2276 * The format swizzling applies before sRGB decode, and 2277 * ARB_texture_swizzle is the last thing before returning the sample. 2278 */ 2279 for (int i = 0; i < ARRAY_SIZE(key->tex); i++) { 2280 enum pipe_format format = c->key->tex[i].format; 2281 2282 if (!format) 2283 continue; 2284 2285 const uint8_t *format_swizzle = vc4_get_format_swizzle(format); 2286 2287 for (int j = 0; j < 4; j++) { 2288 uint8_t arb_swiz = c->key->tex[i].swizzle[j]; 2289 2290 if (arb_swiz <= 3) { 2291 tex_options.swizzles[i][j] = 2292 format_swizzle[arb_swiz]; 2293 } else { 2294 tex_options.swizzles[i][j] = arb_swiz; 2295 } 2296 } 2297 2298 if (util_format_is_srgb(format)) 2299 tex_options.lower_srgb |= (1 << i); 2300 } 2301 2302 NIR_PASS_V(c->s, nir_lower_tex, &tex_options); 2303 2304 if (c->fs_key && c->fs_key->light_twoside) 2305 NIR_PASS_V(c->s, nir_lower_two_sided_color); 2306 2307 if (c->vs_key && c->vs_key->clamp_color) 2308 NIR_PASS_V(c->s, nir_lower_clamp_color_outputs); 2309 2310 if (c->key->ucp_enables) { 2311 if (stage == QSTAGE_FRAG) { 2312 NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables); 2313 } else { 2314 NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables); 2315 NIR_PASS_V(c->s, nir_lower_io_to_scalar, 2316 nir_var_shader_out); 2317 } 2318 } 2319 2320 /* FS input scalarizing must happen after nir_lower_two_sided_color, 2321 * which only handles a vec4 at a time. Similarly, VS output 2322 * scalarizing must happen after nir_lower_clip_vs. 2323 */ 2324 if (c->stage == QSTAGE_FRAG) 2325 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); 2326 else 2327 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); 2328 2329 NIR_PASS_V(c->s, vc4_nir_lower_io, c); 2330 NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c); 2331 NIR_PASS_V(c->s, nir_lower_idiv); 2332 2333 vc4_optimize_nir(c->s); 2334 2335 NIR_PASS_V(c->s, nir_convert_from_ssa, true); 2336 2337 if (vc4_debug & VC4_DEBUG_SHADERDB) { 2338 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", 2339 qir_get_stage_name(c->stage), 2340 c->program_id, c->variant_id, 2341 count_nir_instrs(c->s)); 2342 } 2343 2344 if (vc4_debug & VC4_DEBUG_NIR) { 2345 fprintf(stderr, "%s prog %d/%d NIR:\n", 2346 qir_get_stage_name(c->stage), 2347 c->program_id, c->variant_id); 2348 nir_print_shader(c->s, stderr); 2349 } 2350 2351 nir_to_qir(c); 2352 2353 switch (stage) { 2354 case QSTAGE_FRAG: 2355 /* FS threading requires that the thread execute 2356 * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating 2357 * (with no other THRSW afterwards, obviously). If we didn't 2358 * fetch a texture at a top level block, this wouldn't be 2359 * true. 2360 */ 2361 if (c->fs_threaded && !c->last_thrsw_at_top_level) { 2362 c->failed = true; 2363 return c; 2364 } 2365 2366 emit_frag_end(c); 2367 break; 2368 case QSTAGE_VERT: 2369 emit_vert_end(c, 2370 c->vs_key->fs_inputs->input_slots, 2371 c->vs_key->fs_inputs->num_inputs); 2372 break; 2373 case QSTAGE_COORD: 2374 emit_coord_end(c); 2375 break; 2376 } 2377 2378 if (vc4_debug & VC4_DEBUG_QIR) { 2379 fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n", 2380 qir_get_stage_name(c->stage), 2381 c->program_id, c->variant_id); 2382 qir_dump(c); 2383 fprintf(stderr, "\n"); 2384 } 2385 2386 qir_optimize(c); 2387 qir_lower_uniforms(c); 2388 2389 qir_schedule_instructions(c); 2390 qir_emit_uniform_stream_resets(c); 2391 2392 if (vc4_debug & VC4_DEBUG_QIR) { 2393 fprintf(stderr, "%s prog %d/%d QIR:\n", 2394 qir_get_stage_name(c->stage), 2395 c->program_id, c->variant_id); 2396 qir_dump(c); 2397 fprintf(stderr, "\n"); 2398 } 2399 2400 qir_reorder_uniforms(c); 2401 vc4_generate_code(vc4, c); 2402 2403 if (vc4_debug & VC4_DEBUG_SHADERDB) { 2404 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n", 2405 qir_get_stage_name(c->stage), 2406 c->program_id, c->variant_id, 2407 c->qpu_inst_count); 2408 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n", 2409 qir_get_stage_name(c->stage), 2410 c->program_id, c->variant_id, 2411 c->num_uniforms); 2412 } 2413 2414 ralloc_free(c->s); 2415 2416 return c; 2417 } 2418 2419 static void * 2420 vc4_shader_state_create(struct pipe_context *pctx, 2421 const struct pipe_shader_state *cso) 2422 { 2423 struct vc4_context *vc4 = vc4_context(pctx); 2424 struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader); 2425 if (!so) 2426 return NULL; 2427 2428 so->program_id = vc4->next_uncompiled_program_id++; 2429 2430 nir_shader *s; 2431 2432 if (cso->type == PIPE_SHADER_IR_NIR) { 2433 /* The backend takes ownership of the NIR shader on state 2434 * creation. 2435 */ 2436 s = cso->ir.nir; 2437 2438 NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size, 2439 (nir_lower_io_options)0); 2440 } else { 2441 assert(cso->type == PIPE_SHADER_IR_TGSI); 2442 2443 if (vc4_debug & VC4_DEBUG_TGSI) { 2444 fprintf(stderr, "prog %d TGSI:\n", 2445 so->program_id); 2446 tgsi_dump(cso->tokens, 0); 2447 fprintf(stderr, "\n"); 2448 } 2449 s = tgsi_to_nir(cso->tokens, &nir_options); 2450 } 2451 2452 NIR_PASS_V(s, nir_opt_global_to_local); 2453 NIR_PASS_V(s, nir_lower_regs_to_ssa); 2454 NIR_PASS_V(s, nir_normalize_cubemap_coords); 2455 2456 NIR_PASS_V(s, nir_lower_load_const_to_scalar); 2457 2458 vc4_optimize_nir(s); 2459 2460 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_local); 2461 2462 /* Garbage collect dead instructions */ 2463 nir_sweep(s); 2464 2465 so->base.type = PIPE_SHADER_IR_NIR; 2466 so->base.ir.nir = s; 2467 2468 if (vc4_debug & VC4_DEBUG_NIR) { 2469 fprintf(stderr, "%s prog %d NIR:\n", 2470 gl_shader_stage_name(s->info.stage), 2471 so->program_id); 2472 nir_print_shader(s, stderr); 2473 fprintf(stderr, "\n"); 2474 } 2475 2476 return so; 2477 } 2478 2479 static void 2480 copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, 2481 struct vc4_compile *c) 2482 { 2483 int count = c->num_uniforms; 2484 struct vc4_shader_uniform_info *uinfo = &shader->uniforms; 2485 2486 uinfo->count = count; 2487 uinfo->data = ralloc_array(shader, uint32_t, count); 2488 memcpy(uinfo->data, c->uniform_data, 2489 count * sizeof(*uinfo->data)); 2490 uinfo->contents = ralloc_array(shader, enum quniform_contents, count); 2491 memcpy(uinfo->contents, c->uniform_contents, 2492 count * sizeof(*uinfo->contents)); 2493 uinfo->num_texture_samples = c->num_texture_samples; 2494 2495 vc4_set_shader_uniform_dirty_flags(shader); 2496 } 2497 2498 static void 2499 vc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, 2500 struct vc4_compiled_shader *shader) 2501 { 2502 struct vc4_fs_inputs inputs; 2503 2504 memset(&inputs, 0, sizeof(inputs)); 2505 inputs.input_slots = ralloc_array(shader, 2506 struct vc4_varying_slot, 2507 c->num_input_slots); 2508 2509 bool input_live[c->num_input_slots]; 2510 2511 memset(input_live, 0, sizeof(input_live)); 2512 qir_for_each_inst_inorder(inst, c) { 2513 for (int i = 0; i < qir_get_nsrc(inst); i++) { 2514 if (inst->src[i].file == QFILE_VARY) 2515 input_live[inst->src[i].index] = true; 2516 } 2517 } 2518 2519 for (int i = 0; i < c->num_input_slots; i++) { 2520 struct vc4_varying_slot *slot = &c->input_slots[i]; 2521 2522 if (!input_live[i]) 2523 continue; 2524 2525 /* Skip non-VS-output inputs. */ 2526 if (slot->slot == (uint8_t)~0) 2527 continue; 2528 2529 if (slot->slot == VARYING_SLOT_COL0 || 2530 slot->slot == VARYING_SLOT_COL1 || 2531 slot->slot == VARYING_SLOT_BFC0 || 2532 slot->slot == VARYING_SLOT_BFC1) { 2533 shader->color_inputs |= (1 << inputs.num_inputs); 2534 } 2535 2536 inputs.input_slots[inputs.num_inputs] = *slot; 2537 inputs.num_inputs++; 2538 } 2539 shader->num_inputs = inputs.num_inputs; 2540 2541 /* Add our set of inputs to the set of all inputs seen. This way, we 2542 * can have a single pointer that identifies an FS inputs set, 2543 * allowing VS to avoid recompiling when the FS is recompiled (or a 2544 * new one is bound using separate shader objects) but the inputs 2545 * don't change. 2546 */ 2547 struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs); 2548 if (entry) { 2549 shader->fs_inputs = entry->key; 2550 ralloc_free(inputs.input_slots); 2551 } else { 2552 struct vc4_fs_inputs *alloc_inputs; 2553 2554 alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs); 2555 memcpy(alloc_inputs, &inputs, sizeof(inputs)); 2556 ralloc_steal(alloc_inputs, inputs.input_slots); 2557 _mesa_set_add(vc4->fs_inputs_set, alloc_inputs); 2558 2559 shader->fs_inputs = alloc_inputs; 2560 } 2561 } 2562 2563 static struct vc4_compiled_shader * 2564 vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, 2565 struct vc4_key *key) 2566 { 2567 struct hash_table *ht; 2568 uint32_t key_size; 2569 bool try_threading; 2570 2571 if (stage == QSTAGE_FRAG) { 2572 ht = vc4->fs_cache; 2573 key_size = sizeof(struct vc4_fs_key); 2574 try_threading = vc4->screen->has_threaded_fs; 2575 } else { 2576 ht = vc4->vs_cache; 2577 key_size = sizeof(struct vc4_vs_key); 2578 try_threading = false; 2579 } 2580 2581 struct vc4_compiled_shader *shader; 2582 struct hash_entry *entry = _mesa_hash_table_search(ht, key); 2583 if (entry) 2584 return entry->data; 2585 2586 struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading); 2587 /* If the FS failed to compile threaded, fall back to single threaded. */ 2588 if (try_threading && c->failed) { 2589 qir_compile_destroy(c); 2590 c = vc4_shader_ntq(vc4, stage, key, false); 2591 } 2592 2593 shader = rzalloc(NULL, struct vc4_compiled_shader); 2594 2595 shader->program_id = vc4->next_compiled_program_id++; 2596 if (stage == QSTAGE_FRAG) { 2597 vc4_setup_compiled_fs_inputs(vc4, c, shader); 2598 2599 /* Note: the temporary clone in c->s has been freed. */ 2600 nir_shader *orig_shader = key->shader_state->base.ir.nir; 2601 if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH)) 2602 shader->disable_early_z = true; 2603 } else { 2604 shader->num_inputs = c->num_inputs; 2605 2606 shader->vattr_offsets[0] = 0; 2607 for (int i = 0; i < 8; i++) { 2608 shader->vattr_offsets[i + 1] = 2609 shader->vattr_offsets[i] + c->vattr_sizes[i]; 2610 2611 if (c->vattr_sizes[i]) 2612 shader->vattrs_live |= (1 << i); 2613 } 2614 } 2615 2616 shader->failed = c->failed; 2617 if (c->failed) { 2618 shader->failed = true; 2619 } else { 2620 copy_uniform_state_to_shader(shader, c); 2621 shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts, 2622 c->qpu_inst_count * 2623 sizeof(uint64_t)); 2624 } 2625 2626 shader->fs_threaded = c->fs_threaded; 2627 2628 /* Copy the compiler UBO range state to the compiled shader, dropping 2629 * out arrays that were never referenced by an indirect load. 2630 * 2631 * (Note that QIR dead code elimination of an array access still 2632 * leaves that array alive, though) 2633 */ 2634 if (c->num_ubo_ranges) { 2635 shader->num_ubo_ranges = c->num_ubo_ranges; 2636 shader->ubo_ranges = ralloc_array(shader, struct vc4_ubo_range, 2637 c->num_ubo_ranges); 2638 uint32_t j = 0; 2639 for (int i = 0; i < c->num_uniform_ranges; i++) { 2640 struct vc4_compiler_ubo_range *range = 2641 &c->ubo_ranges[i]; 2642 if (!range->used) 2643 continue; 2644 2645 shader->ubo_ranges[j].dst_offset = range->dst_offset; 2646 shader->ubo_ranges[j].src_offset = range->src_offset; 2647 shader->ubo_ranges[j].size = range->size; 2648 shader->ubo_size += c->ubo_ranges[i].size; 2649 j++; 2650 } 2651 } 2652 if (shader->ubo_size) { 2653 if (vc4_debug & VC4_DEBUG_SHADERDB) { 2654 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n", 2655 qir_get_stage_name(c->stage), 2656 c->program_id, c->variant_id, 2657 shader->ubo_size / 4); 2658 } 2659 } 2660 2661 if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) { 2662 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n", 2663 qir_get_stage_name(c->stage), 2664 c->program_id, c->variant_id, 2665 1 + shader->fs_threaded); 2666 } 2667 2668 qir_compile_destroy(c); 2669 2670 struct vc4_key *dup_key; 2671 dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */ 2672 memcpy(dup_key, key, key_size); 2673 _mesa_hash_table_insert(ht, dup_key, shader); 2674 2675 return shader; 2676 } 2677 2678 static void 2679 vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key, 2680 struct vc4_texture_stateobj *texstate) 2681 { 2682 for (int i = 0; i < texstate->num_textures; i++) { 2683 struct pipe_sampler_view *sampler = texstate->textures[i]; 2684 struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler); 2685 struct pipe_sampler_state *sampler_state = 2686 texstate->samplers[i]; 2687 2688 if (!sampler) 2689 continue; 2690 2691 key->tex[i].format = sampler->format; 2692 key->tex[i].swizzle[0] = sampler->swizzle_r; 2693 key->tex[i].swizzle[1] = sampler->swizzle_g; 2694 key->tex[i].swizzle[2] = sampler->swizzle_b; 2695 key->tex[i].swizzle[3] = sampler->swizzle_a; 2696 2697 if (sampler->texture->nr_samples > 1) { 2698 key->tex[i].msaa_width = sampler->texture->width0; 2699 key->tex[i].msaa_height = sampler->texture->height0; 2700 } else if (sampler){ 2701 key->tex[i].compare_mode = sampler_state->compare_mode; 2702 key->tex[i].compare_func = sampler_state->compare_func; 2703 key->tex[i].wrap_s = sampler_state->wrap_s; 2704 key->tex[i].wrap_t = sampler_state->wrap_t; 2705 key->tex[i].force_first_level = 2706 vc4_sampler->force_first_level; 2707 } 2708 } 2709 2710 key->ucp_enables = vc4->rasterizer->base.clip_plane_enable; 2711 } 2712 2713 static void 2714 vc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) 2715 { 2716 struct vc4_job *job = vc4->job; 2717 struct vc4_fs_key local_key; 2718 struct vc4_fs_key *key = &local_key; 2719 2720 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE | 2721 VC4_DIRTY_BLEND | 2722 VC4_DIRTY_FRAMEBUFFER | 2723 VC4_DIRTY_ZSA | 2724 VC4_DIRTY_RASTERIZER | 2725 VC4_DIRTY_SAMPLE_MASK | 2726 VC4_DIRTY_FRAGTEX | 2727 VC4_DIRTY_UNCOMPILED_FS))) { 2728 return; 2729 } 2730 2731 memset(key, 0, sizeof(*key)); 2732 vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex); 2733 key->base.shader_state = vc4->prog.bind_fs; 2734 key->is_points = (prim_mode == PIPE_PRIM_POINTS); 2735 key->is_lines = (prim_mode >= PIPE_PRIM_LINES && 2736 prim_mode <= PIPE_PRIM_LINE_STRIP); 2737 key->blend = vc4->blend->rt[0]; 2738 if (vc4->blend->logicop_enable) { 2739 key->logicop_func = vc4->blend->logicop_func; 2740 } else { 2741 key->logicop_func = PIPE_LOGICOP_COPY; 2742 } 2743 if (job->msaa) { 2744 key->msaa = vc4->rasterizer->base.multisample; 2745 key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); 2746 key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage; 2747 key->sample_alpha_to_one = vc4->blend->alpha_to_one; 2748 } 2749 2750 if (vc4->framebuffer.cbufs[0]) 2751 key->color_format = vc4->framebuffer.cbufs[0]->format; 2752 2753 key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0; 2754 key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0; 2755 key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0; 2756 key->depth_enabled = (vc4->zsa->base.depth.enabled || 2757 key->stencil_enabled); 2758 if (vc4->zsa->base.alpha.enabled) 2759 key->alpha_test_func = vc4->zsa->base.alpha.func; 2760 else 2761 key->alpha_test_func = COMPARE_FUNC_ALWAYS; 2762 2763 if (key->is_points) { 2764 key->point_sprite_mask = 2765 vc4->rasterizer->base.sprite_coord_enable; 2766 key->point_coord_upper_left = 2767 (vc4->rasterizer->base.sprite_coord_mode == 2768 PIPE_SPRITE_COORD_UPPER_LEFT); 2769 } 2770 2771 key->light_twoside = vc4->rasterizer->base.light_twoside; 2772 2773 struct vc4_compiled_shader *old_fs = vc4->prog.fs; 2774 vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base); 2775 if (vc4->prog.fs == old_fs) 2776 return; 2777 2778 vc4->dirty |= VC4_DIRTY_COMPILED_FS; 2779 2780 if (vc4->rasterizer->base.flatshade && 2781 (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) { 2782 vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS; 2783 } 2784 2785 if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs) 2786 vc4->dirty |= VC4_DIRTY_FS_INPUTS; 2787 } 2788 2789 static void 2790 vc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) 2791 { 2792 struct vc4_vs_key local_key; 2793 struct vc4_vs_key *key = &local_key; 2794 2795 if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE | 2796 VC4_DIRTY_RASTERIZER | 2797 VC4_DIRTY_VERTTEX | 2798 VC4_DIRTY_VTXSTATE | 2799 VC4_DIRTY_UNCOMPILED_VS | 2800 VC4_DIRTY_FS_INPUTS))) { 2801 return; 2802 } 2803 2804 memset(key, 0, sizeof(*key)); 2805 vc4_setup_shared_key(vc4, &key->base, &vc4->verttex); 2806 key->base.shader_state = vc4->prog.bind_vs; 2807 key->fs_inputs = vc4->prog.fs->fs_inputs; 2808 key->clamp_color = vc4->rasterizer->base.clamp_vertex_color; 2809 2810 for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++) 2811 key->attr_formats[i] = vc4->vtx->pipe[i].src_format; 2812 2813 key->per_vertex_point_size = 2814 (prim_mode == PIPE_PRIM_POINTS && 2815 vc4->rasterizer->base.point_size_per_vertex); 2816 2817 struct vc4_compiled_shader *vs = 2818 vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); 2819 if (vs != vc4->prog.vs) { 2820 vc4->prog.vs = vs; 2821 vc4->dirty |= VC4_DIRTY_COMPILED_VS; 2822 } 2823 2824 key->is_coord = true; 2825 /* Coord shaders don't care what the FS inputs are. */ 2826 key->fs_inputs = NULL; 2827 struct vc4_compiled_shader *cs = 2828 vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); 2829 if (cs != vc4->prog.cs) { 2830 vc4->prog.cs = cs; 2831 vc4->dirty |= VC4_DIRTY_COMPILED_CS; 2832 } 2833 } 2834 2835 bool 2836 vc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode) 2837 { 2838 vc4_update_compiled_fs(vc4, prim_mode); 2839 vc4_update_compiled_vs(vc4, prim_mode); 2840 2841 return !(vc4->prog.cs->failed || 2842 vc4->prog.vs->failed || 2843 vc4->prog.fs->failed); 2844 } 2845 2846 static uint32_t 2847 fs_cache_hash(const void *key) 2848 { 2849 return _mesa_hash_data(key, sizeof(struct vc4_fs_key)); 2850 } 2851 2852 static uint32_t 2853 vs_cache_hash(const void *key) 2854 { 2855 return _mesa_hash_data(key, sizeof(struct vc4_vs_key)); 2856 } 2857 2858 static bool 2859 fs_cache_compare(const void *key1, const void *key2) 2860 { 2861 return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0; 2862 } 2863 2864 static bool 2865 vs_cache_compare(const void *key1, const void *key2) 2866 { 2867 return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0; 2868 } 2869 2870 static uint32_t 2871 fs_inputs_hash(const void *key) 2872 { 2873 const struct vc4_fs_inputs *inputs = key; 2874 2875 return _mesa_hash_data(inputs->input_slots, 2876 sizeof(*inputs->input_slots) * 2877 inputs->num_inputs); 2878 } 2879 2880 static bool 2881 fs_inputs_compare(const void *key1, const void *key2) 2882 { 2883 const struct vc4_fs_inputs *inputs1 = key1; 2884 const struct vc4_fs_inputs *inputs2 = key2; 2885 2886 return (inputs1->num_inputs == inputs2->num_inputs && 2887 memcmp(inputs1->input_slots, 2888 inputs2->input_slots, 2889 sizeof(*inputs1->input_slots) * 2890 inputs1->num_inputs) == 0); 2891 } 2892 2893 static void 2894 delete_from_cache_if_matches(struct hash_table *ht, 2895 struct vc4_compiled_shader **last_compile, 2896 struct hash_entry *entry, 2897 struct vc4_uncompiled_shader *so) 2898 { 2899 const struct vc4_key *key = entry->key; 2900 2901 if (key->shader_state == so) { 2902 struct vc4_compiled_shader *shader = entry->data; 2903 _mesa_hash_table_remove(ht, entry); 2904 vc4_bo_unreference(&shader->bo); 2905 2906 if (shader == *last_compile) 2907 *last_compile = NULL; 2908 2909 ralloc_free(shader); 2910 } 2911 } 2912 2913 static void 2914 vc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) 2915 { 2916 struct vc4_context *vc4 = vc4_context(pctx); 2917 struct vc4_uncompiled_shader *so = hwcso; 2918 2919 struct hash_entry *entry; 2920 hash_table_foreach(vc4->fs_cache, entry) { 2921 delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs, 2922 entry, so); 2923 } 2924 hash_table_foreach(vc4->vs_cache, entry) { 2925 delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs, 2926 entry, so); 2927 } 2928 2929 ralloc_free(so->base.ir.nir); 2930 free(so); 2931 } 2932 2933 static void 2934 vc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) 2935 { 2936 struct vc4_context *vc4 = vc4_context(pctx); 2937 vc4->prog.bind_fs = hwcso; 2938 vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS; 2939 } 2940 2941 static void 2942 vc4_vp_state_bind(struct pipe_context *pctx, void *hwcso) 2943 { 2944 struct vc4_context *vc4 = vc4_context(pctx); 2945 vc4->prog.bind_vs = hwcso; 2946 vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS; 2947 } 2948 2949 void 2950 vc4_program_init(struct pipe_context *pctx) 2951 { 2952 struct vc4_context *vc4 = vc4_context(pctx); 2953 2954 pctx->create_vs_state = vc4_shader_state_create; 2955 pctx->delete_vs_state = vc4_shader_state_delete; 2956 2957 pctx->create_fs_state = vc4_shader_state_create; 2958 pctx->delete_fs_state = vc4_shader_state_delete; 2959 2960 pctx->bind_fs_state = vc4_fp_state_bind; 2961 pctx->bind_vs_state = vc4_vp_state_bind; 2962 2963 vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash, 2964 fs_cache_compare); 2965 vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash, 2966 vs_cache_compare); 2967 vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash, 2968 fs_inputs_compare); 2969 } 2970 2971 void 2972 vc4_program_fini(struct pipe_context *pctx) 2973 { 2974 struct vc4_context *vc4 = vc4_context(pctx); 2975 2976 struct hash_entry *entry; 2977 hash_table_foreach(vc4->fs_cache, entry) { 2978 struct vc4_compiled_shader *shader = entry->data; 2979 vc4_bo_unreference(&shader->bo); 2980 ralloc_free(shader); 2981 _mesa_hash_table_remove(vc4->fs_cache, entry); 2982 } 2983 2984 hash_table_foreach(vc4->vs_cache, entry) { 2985 struct vc4_compiled_shader *shader = entry->data; 2986 vc4_bo_unreference(&shader->bo); 2987 ralloc_free(shader); 2988 _mesa_hash_table_remove(vc4->vs_cache, entry); 2989 } 2990 } 2991