      1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
      2 
      3 /*
      4  * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     23  * SOFTWARE.
     24  *
     25  * Authors:
     26  *    Rob Clark <robclark@freedesktop.org>
     27  */
     28 
     29 #include <stdarg.h>
     30 
     31 #include "pipe/p_state.h"
     32 #include "util/u_string.h"
     33 #include "util/u_memory.h"
     34 #include "util/u_inlines.h"
     35 
     36 #include "freedreno_util.h"
     37 
     38 #include "ir3_compiler.h"
     39 #include "ir3_shader.h"
     40 #include "ir3_nir.h"
     41 
     42 #include "instr-a3xx.h"
     43 #include "ir3.h"
     44 
     45 
     46 struct ir3_compile {
     47 	struct ir3_compiler *compiler;
     48 
     49 	struct nir_shader *s;
     50 
     51 	struct ir3 *ir;
     52 	struct ir3_shader_variant *so;
     53 
     54 	struct ir3_block *block;      /* the current block */
     55 	struct ir3_block *in_block;   /* block created for shader inputs */
     56 
     57 	nir_function_impl *impl;
     58 
     59 	/* For fragment shaders, from the hw perspective the only
     60 	 * actual input is the r0.xy position register passed to bary.f.
     61 	 * But TGSI doesn't know that; it still declares things as
     62 	 * IN[] registers.  So we do all the input tracking normally
     63 	 * and fix things up after compile_instructions()
     64 	 *
     65 	 * NOTE that frag_pos is the hardware position (possibly it
     66 	 * is actually an index or tag or some such.. it is *not*
     67 	 * values that can be directly used for gl_FragCoord..)
     68 	 */
     69 	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
     70 
     71 	/* For vertex shaders, keep track of the system values sources */
     72 	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
     73 
     74 	/* mapping from nir_register to defining instruction: */
     75 	struct hash_table *def_ht;
     76 
     77 	unsigned num_arrays;
     78 
     79 	/* a common pattern for indirect addressing is to request the
     80 	 * same address register multiple times.  To avoid generating
     81 	 * duplicate instruction sequences (which our backend does not
     82 	 * try to clean up, since that should be done at the NIR stage)
     83 	 * we cache the address value generated for a given src value:
     84 	 */
     85 	struct hash_table *addr_ht;
     86 
     87 	/* maps nir_block to ir3_block, mostly for the purposes of
     88 	 * figuring out a block's successors
     89 	 */
     90 	struct hash_table *block_ht;
     91 
     92 	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
     93 	 * so we need to use ldlv.u32 to load the varying directly:
     94 	 */
     95 	bool flat_bypass;
     96 
     97 	/* on a3xx, we need to add one to # of array levels:
     98 	 */
     99 	bool levels_add_one;
    100 
    101 	/* on a3xx, we need to scale up integer coords for isaml based
    102 	 * on LoD:
    103 	 */
    104 	bool unminify_coords;
    105 
    106 	/* on a4xx, for array textures we need to add 0.5 to the array
    107 	 * index coordinate:
    108 	 */
    109 	bool array_index_add_half;
    110 
    111 	/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
    112 	unsigned astc_srgb;
    113 
    114 	unsigned max_texture_index;
    115 
    116 	/* set if we encounter something we can't handle yet, so we
    117 	 * can bail cleanly and fall back to the TGSI compiler f/e
    118 	 */
    119 	bool error;
    120 };
    121 
    122 /* gpu pointer size in units of 32bit registers/slots */
    123 static unsigned pointer_size(struct ir3_compile *ctx)
    124 {
    125 	return (ctx->compiler->gpu_id >= 500) ? 2 : 1;
    126 }
    127 
    128 static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
    129 static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
    130 
    131 
    132 static struct ir3_compile *
    133 compile_init(struct ir3_compiler *compiler,
    134 		struct ir3_shader_variant *so)
    135 {
    136 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
    137 
    138 	if (compiler->gpu_id >= 400) {
    139 		/* need special handling for "flat" */
    140 		ctx->flat_bypass = true;
    141 		ctx->levels_add_one = false;
    142 		ctx->unminify_coords = false;
    143 		ctx->array_index_add_half = true;
    144 
    145 		if (so->type == SHADER_VERTEX)
    146 			ctx->astc_srgb = so->key.vastc_srgb;
    147 		else if (so->type == SHADER_FRAGMENT)
    148 			ctx->astc_srgb = so->key.fastc_srgb;
    149 
    150 	} else {
    151 		/* no special handling for "flat" */
    152 		ctx->flat_bypass = false;
    153 		ctx->levels_add_one = true;
    154 		ctx->unminify_coords = true;
    155 		ctx->array_index_add_half = false;
    156 	}
    157 
    158 	ctx->compiler = compiler;
    159 	ctx->ir = so->ir;
    160 	ctx->so = so;
    161 	ctx->def_ht = _mesa_hash_table_create(ctx,
    162 			_mesa_hash_pointer, _mesa_key_pointer_equal);
    163 	ctx->block_ht = _mesa_hash_table_create(ctx,
    164 			_mesa_hash_pointer, _mesa_key_pointer_equal);
    165 
    166 	/* TODO: maybe generate some sort of bitmask of what key
    167 	 * lowers vs what the shader has (ie. no need to run
    168 	 * texture clamp lowering if there are no texture sample instrs)..
    169 	 * although this should be done further up the stack to avoid
    170 	 * creating duplicate variants..
    171 	 */
    172 
    173 	if (ir3_key_lowers_nir(&so->key)) {
    174 		nir_shader *s = nir_shader_clone(ctx, so->shader->nir);
    175 		ctx->s = ir3_optimize_nir(so->shader, s, &so->key);
    176 	} else {
    177 		/* fast-path for shader key that lowers nothing in NIR: */
    178 		ctx->s = so->shader->nir;
    179 	}
    180 
    181 	if (fd_mesa_debug & FD_DBG_DISASM) {
    182 		DBG("dump nir%dv%d: type=%d, k={bp=%u,cts=%u,hp=%u}",
    183 			so->shader->id, so->id, so->type,
    184 			so->key.binning_pass, so->key.color_two_side,
    185 			so->key.half_precision);
    186 		nir_print_shader(ctx->s, stdout);
    187 	}
    188 
    189 	so->num_uniforms = ctx->s->num_uniforms;
    190 	so->num_ubos = ctx->s->info->num_ubos;
    191 
    192 	/* Layout of constant registers, each section aligned to vec4.  Note
    193 	 * that pointer size (ubo, etc) changes depending on generation.
    194 	 *
    195 	 *    user consts
    196 	 *    UBO addresses
    197 	 *    if (vertex shader) {
    198 	 *        driver params (IR3_DP_*)
    199 	 *        if (stream_output.num_outputs > 0)
    200 	 *           stream-out addresses
    201 	 *    }
    202 	 *    immediates
    203 	 *
    204 	 * Immediates go last mostly because they are inserted in the CP pass
    205 	 * after the nir -> ir3 frontend.
    206 	 */
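	/* A worked sketch of the layout above, using made-up numbers (not from
	 * any particular shader): a fragment shader with num_uniforms == 6 and
	 * 3 UBOs on a gen where ptrsz == 1 gives constoff = align(6, 4) = 8, so
	 * constbase.ubo = 8; the UBO pointers take align(3 * 1, 4) / 4 = 1 vec4,
	 * leaving constbase.immediate = 9.  Vertex shaders additionally insert
	 * the driver-param (and possibly stream-out) sections before the
	 * immediates.
	 */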
    207 	unsigned constoff = align(ctx->s->num_uniforms, 4);
    208 	unsigned ptrsz = pointer_size(ctx);
    209 
    210 	memset(&so->constbase, ~0, sizeof(so->constbase));
    211 
    212 	if (so->num_ubos > 0) {
    213 		so->constbase.ubo = constoff;
    214 		constoff += align(ctx->s->info->num_ubos * ptrsz, 4) / 4;
    215 	}
    216 
    217 	if (so->type == SHADER_VERTEX) {
    218 		so->constbase.driver_param = constoff;
    219 		constoff += align(IR3_DP_COUNT, 4) / 4;
    220 
    221 		if ((compiler->gpu_id < 500) &&
    222 				so->shader->stream_output.num_outputs > 0) {
    223 			so->constbase.tfbo = constoff;
    224 			constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
    225 		}
    226 	}
    227 
    228 	so->constbase.immediate = constoff;
    229 
    230 	return ctx;
    231 }
    232 
    233 static void
    234 compile_error(struct ir3_compile *ctx, const char *format, ...)
    235 {
    236 	va_list ap;
    237 	va_start(ap, format);
    238 	_debug_vprintf(format, ap);
    239 	va_end(ap);
    240 	nir_print_shader(ctx->s, stdout);
    241 	ctx->error = true;
    242 	debug_assert(0);
    243 }
    244 
    245 #define compile_assert(ctx, cond) do { \
    246 		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
    247 	} while (0)
    248 
    249 static void
    250 compile_free(struct ir3_compile *ctx)
    251 {
    252 	ralloc_free(ctx);
    253 }
    254 
    255 static void
    256 declare_var(struct ir3_compile *ctx, nir_variable *var)
    257 {
    258 	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
    259 	struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
    260 	arr->id = ++ctx->num_arrays;
    261 	arr->length = length;
    262 	arr->var = var;
    263 	list_addtail(&arr->node, &ctx->ir->array_list);
    264 }
    265 
    266 static struct ir3_array *
    267 get_var(struct ir3_compile *ctx, nir_variable *var)
    268 {
    269 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
    270 		if (arr->var == var)
    271 			return arr;
    272 	}
    273 	compile_error(ctx, "bogus var: %s\n", var->name);
    274 	return NULL;
    275 }
    276 
    277 /* allocate an n-element value array (to be populated by the caller) and
    278  * insert it in def_ht
    279  */
    280 static struct ir3_instruction **
    281 __get_dst(struct ir3_compile *ctx, void *key, unsigned n)
    282 {
    283 	struct ir3_instruction **value =
    284 		ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
    285 	_mesa_hash_table_insert(ctx->def_ht, key, value);
    286 	return value;
    287 }
    288 
    289 static struct ir3_instruction **
    290 get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
    291 {
    292 	compile_assert(ctx, dst->is_ssa);
    293 	if (dst->is_ssa) {
    294 		return __get_dst(ctx, &dst->ssa, n);
    295 	} else {
    296 		return __get_dst(ctx, dst->reg.reg, n);
    297 	}
    298 }
    299 
    300 static struct ir3_instruction **
    301 get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
    302 {
    303 	return __get_dst(ctx, dst, n);
    304 }
    305 
    306 static struct ir3_instruction * const *
    307 get_src(struct ir3_compile *ctx, nir_src *src)
    308 {
    309 	struct hash_entry *entry;
    310 	compile_assert(ctx, src->is_ssa);
    311 	if (src->is_ssa) {
    312 		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
    313 	} else {
    314 		entry = _mesa_hash_table_search(ctx->def_ht, src->reg.reg);
    315 	}
    316 	compile_assert(ctx, entry);
    317 	return entry->data;
    318 }
    319 
    320 static struct ir3_instruction *
    321 create_immed(struct ir3_block *block, uint32_t val)
    322 {
    323 	struct ir3_instruction *mov;
    324 
    325 	mov = ir3_instr_create(block, OPC_MOV);
    326 	mov->cat1.src_type = TYPE_U32;
    327 	mov->cat1.dst_type = TYPE_U32;
    328 	ir3_reg_create(mov, 0, 0);
    329 	ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
    330 
    331 	return mov;
    332 }
    333 
    334 static struct ir3_instruction *
    335 create_addr(struct ir3_block *block, struct ir3_instruction *src)
    336 {
    337 	struct ir3_instruction *instr, *immed;
    338 
    339 	/* TODO in at least some cases, the backend could probably be
    340 	 * made clever enough to propagate IR3_REG_HALF..
    341 	 */
    342 	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
    343 	instr->regs[0]->flags |= IR3_REG_HALF;
    344 
    345 	immed = create_immed(block, 2);
    346 	immed->regs[0]->flags |= IR3_REG_HALF;
    347 
    348 	instr = ir3_SHL_B(block, instr, 0, immed, 0);
    349 	instr->regs[0]->flags |= IR3_REG_HALF;
    350 	instr->regs[1]->flags |= IR3_REG_HALF;
    351 
    352 	instr = ir3_MOV(block, instr, TYPE_S16);
    353 	instr->regs[0]->num = regid(REG_A0, 0);
    354 	instr->regs[0]->flags |= IR3_REG_HALF;
    355 	instr->regs[1]->flags |= IR3_REG_HALF;
    356 
    357 	return instr;
    358 }
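/* A rough sketch of what create_addr() above emits for an index value 'i'
 * (mnemonics approximate, not taken from a real disassembly):
 *
 *    cov.u32s16 hr, i
 *    shl.b      hr, hr, 2
 *    mov.s16s16 a0.x, hr
 *
 * ie. a0.x = i * 4, computed in half regs.
 */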
    359 
    360 /* caches addr values to avoid generating multiple cov/shl/mova
    361  * sequences for each use of a given NIR-level src as an address
    362  */
    363 static struct ir3_instruction *
    364 get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
    365 {
    366 	struct ir3_instruction *addr;
    367 
    368 	if (!ctx->addr_ht) {
    369 		ctx->addr_ht = _mesa_hash_table_create(ctx,
    370 				_mesa_hash_pointer, _mesa_key_pointer_equal);
    371 	} else {
    372 		struct hash_entry *entry;
    373 		entry = _mesa_hash_table_search(ctx->addr_ht, src);
    374 		if (entry)
    375 			return entry->data;
    376 	}
    377 
    378 	addr = create_addr(ctx->block, src);
    379 	_mesa_hash_table_insert(ctx->addr_ht, src, addr);
    380 
    381 	return addr;
    382 }
    383 
    384 static struct ir3_instruction *
    385 get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src)
    386 {
    387 	struct ir3_block *b = ctx->block;
    388 	struct ir3_instruction *cond;
    389 
    390 	/* NOTE: only cmps.*.* can write p0.x: */
    391 	cond = ir3_CMPS_S(b, src, 0, create_immed(b, 0), 0);
    392 	cond->cat2.condition = IR3_COND_NE;
    393 
    394 	/* condition always goes in predicate register: */
    395 	cond->regs[0]->num = regid(REG_P0, 0);
    396 
    397 	return cond;
    398 }
    399 
    400 static struct ir3_instruction *
    401 create_uniform(struct ir3_compile *ctx, unsigned n)
    402 {
    403 	struct ir3_instruction *mov;
    404 
    405 	mov = ir3_instr_create(ctx->block, OPC_MOV);
    406 	/* TODO get types right? */
    407 	mov->cat1.src_type = TYPE_F32;
    408 	mov->cat1.dst_type = TYPE_F32;
    409 	ir3_reg_create(mov, 0, 0);
    410 	ir3_reg_create(mov, n, IR3_REG_CONST);
    411 
    412 	return mov;
    413 }
    414 
    415 static struct ir3_instruction *
    416 create_uniform_indirect(struct ir3_compile *ctx, int n,
    417 		struct ir3_instruction *address)
    418 {
    419 	struct ir3_instruction *mov;
    420 
    421 	mov = ir3_instr_create(ctx->block, OPC_MOV);
    422 	mov->cat1.src_type = TYPE_U32;
    423 	mov->cat1.dst_type = TYPE_U32;
    424 	ir3_reg_create(mov, 0, 0);
    425 	ir3_reg_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n;
    426 
    427 	ir3_instr_set_address(mov, address);
    428 
    429 	return mov;
    430 }
    431 
    432 static struct ir3_instruction *
    433 create_collect(struct ir3_block *block, struct ir3_instruction **arr,
    434 		unsigned arrsz)
    435 {
    436 	struct ir3_instruction *collect;
    437 
    438 	if (arrsz == 0)
    439 		return NULL;
    440 
    441 	collect = ir3_instr_create2(block, OPC_META_FI, 1 + arrsz);
    442 	ir3_reg_create(collect, 0, 0);     /* dst */
    443 	for (unsigned i = 0; i < arrsz; i++)
    444 		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
    445 
    446 	return collect;
    447 }
    448 
    449 static struct ir3_instruction *
    450 create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, int n,
    451 		struct ir3_instruction *address, struct ir3_instruction *collect)
    452 {
    453 	struct ir3_block *block = ctx->block;
    454 	struct ir3_instruction *mov;
    455 	struct ir3_register *src;
    456 
    457 	mov = ir3_instr_create(block, OPC_MOV);
    458 	mov->cat1.src_type = TYPE_U32;
    459 	mov->cat1.dst_type = TYPE_U32;
    460 	ir3_reg_create(mov, 0, 0);
    461 	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
    462 	src->instr = collect;
    463 	src->size  = arrsz;
    464 	src->array.offset = n;
    465 
    466 	ir3_instr_set_address(mov, address);
    467 
    468 	return mov;
    469 }
    470 
    471 /* relative (indirect) if address!=NULL */
    472 static struct ir3_instruction *
    473 create_var_load(struct ir3_compile *ctx, struct ir3_array *arr, int n,
    474 		struct ir3_instruction *address)
    475 {
    476 	struct ir3_block *block = ctx->block;
    477 	struct ir3_instruction *mov;
    478 	struct ir3_register *src;
    479 
    480 	mov = ir3_instr_create(block, OPC_MOV);
    481 	mov->cat1.src_type = TYPE_U32;
    482 	mov->cat1.dst_type = TYPE_U32;
    483 	ir3_reg_create(mov, 0, 0);
    484 	src = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
    485 			COND(address, IR3_REG_RELATIV));
    486 	src->instr = arr->last_write;
    487 	src->size  = arr->length;
    488 	src->array.id = arr->id;
    489 	src->array.offset = n;
    490 
    491 	if (address)
    492 		ir3_instr_set_address(mov, address);
    493 
    494 	arr->last_access = mov;
    495 
    496 	return mov;
    497 }
    498 
    499 /* relative (indirect) if address!=NULL */
    500 static struct ir3_instruction *
    501 create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
    502 		struct ir3_instruction *src, struct ir3_instruction *address)
    503 {
    504 	struct ir3_block *block = ctx->block;
    505 	struct ir3_instruction *mov;
    506 	struct ir3_register *dst;
    507 
    508 	mov = ir3_instr_create(block, OPC_MOV);
    509 	mov->cat1.src_type = TYPE_U32;
    510 	mov->cat1.dst_type = TYPE_U32;
    511 	dst = ir3_reg_create(mov, 0, IR3_REG_ARRAY |
    512 			COND(address, IR3_REG_RELATIV));
    513 	dst->instr = arr->last_access;
    514 	dst->size  = arr->length;
    515 	dst->array.id = arr->id;
    516 	dst->array.offset = n;
    517 	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
    518 
    519 	ir3_instr_set_address(mov, address);
    520 
    521 	arr->last_write = arr->last_access = mov;
    522 
    523 	return mov;
    524 }
    525 
    526 static struct ir3_instruction *
    527 create_input(struct ir3_block *block, unsigned n)
    528 {
    529 	struct ir3_instruction *in;
    530 
    531 	in = ir3_instr_create(block, OPC_META_INPUT);
    532 	in->inout.block = block;
    533 	ir3_reg_create(in, n, 0);
    534 
    535 	return in;
    536 }
    537 
    538 static struct ir3_instruction *
    539 create_frag_input(struct ir3_compile *ctx, bool use_ldlv)
    540 {
    541 	struct ir3_block *block = ctx->block;
    542 	struct ir3_instruction *instr;
    543 	/* actual inloc is assigned and fixed up later: */
    544 	struct ir3_instruction *inloc = create_immed(block, 0);
    545 
    546 	if (use_ldlv) {
    547 		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
    548 		instr->cat6.type = TYPE_U32;
    549 		instr->cat6.iim_val = 1;
    550 	} else {
    551 		instr = ir3_BARY_F(block, inloc, 0, ctx->frag_pos, 0);
    552 		instr->regs[2]->wrmask = 0x3;
    553 	}
    554 
    555 	return instr;
    556 }
    557 
    558 static struct ir3_instruction *
    559 create_frag_coord(struct ir3_compile *ctx, unsigned comp)
    560 {
    561 	struct ir3_block *block = ctx->block;
    562 	struct ir3_instruction *instr;
    563 
    564 	compile_assert(ctx, !ctx->frag_coord[comp]);
    565 
    566 	ctx->frag_coord[comp] = create_input(ctx->block, 0);
    567 
    568 	switch (comp) {
    569 	case 0: /* .x */
    570 	case 1: /* .y */
    571 		/* for frag_coord, we get unsigned values.. we need
    572 		 * to subtract (integer) 8 and divide by 16 (right-
    573 		 * shift by 4) then convert to float:
    574 		 *
    575 		 *    sub.s tmp, src, 8
    576 		 *    shr.b tmp, tmp, 4
    577 		 *    mov.u32f32 dst, tmp
    578 		 *
    579 		 */
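		/* e.g. with a made-up raw value: if the hw handed us 168 for a
		 * fragment in pixel column 10 (ie. 10 * 16 + 8), then
		 * (168 - 8) >> 4 == 10, which the cov below turns into 10.0f.
		 */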
    580 		instr = ir3_SUB_S(block, ctx->frag_coord[comp], 0,
    581 				create_immed(block, 8), 0);
    582 		instr = ir3_SHR_B(block, instr, 0,
    583 				create_immed(block, 4), 0);
    584 		instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
    585 
    586 		return instr;
    587 	case 2: /* .z */
    588 	case 3: /* .w */
    589 	default:
    590 		/* seems that we can use these as-is: */
    591 		return ctx->frag_coord[comp];
    592 	}
    593 }
    594 
    595 static struct ir3_instruction *
    596 create_driver_param(struct ir3_compile *ctx, enum ir3_driver_param dp)
    597 {
    598 	/* first four vec4 sysval's reserved for UBOs: */
    599 	/* NOTE: dp is in scalar, but there can be >4 dp components: */
    600 	unsigned n = ctx->so->constbase.driver_param;
    601 	unsigned r = regid(n + dp / 4, dp % 4);
    602 	return create_uniform(ctx, r);
    603 }
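/* e.g. a scalar driver-param index of dp == 6 selects component .z of the
 * second driver-param vec4, ie. const (constbase.driver_param + 1).z, per
 * the dp / 4, dp % 4 split above.
 */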
    604 
    605 /* helper for instructions that produce multiple consecutive scalar
    606  * outputs which need to have a split/fanout meta instruction inserted
    607  */
    608 static void
    609 split_dest(struct ir3_block *block, struct ir3_instruction **dst,
    610 		struct ir3_instruction *src, unsigned base, unsigned n)
    611 {
    612 	struct ir3_instruction *prev = NULL;
    613 	for (int i = 0, j = 0; i < n; i++) {
    614 		struct ir3_instruction *split = ir3_instr_create(block, OPC_META_FO);
    615 		ir3_reg_create(split, 0, IR3_REG_SSA);
    616 		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
    617 		split->fo.off = i + base;
    618 
    619 		if (prev) {
    620 			split->cp.left = prev;
    621 			split->cp.left_cnt++;
    622 			prev->cp.right = split;
    623 			prev->cp.right_cnt++;
    624 		}
    625 		prev = split;
    626 
    627 		if (src->regs[0]->wrmask & (1 << (i + base)))
    628 			dst[j++] = split;
    629 	}
    630 }
    631 
    632 /*
    633  * Adreno uses uint rather than having a dedicated bool type,
    634  * which (potentially) requires some conversion, in particular
    635  * when using the output of a bool instr as an int input, or vice
    636  * versa.
    637  *
    638  *         | Adreno  |  NIR  |
    639  *  -------+---------+-------+-
    640  *   true  |    1    |  ~0   |
    641  *   false |    0    |   0   |
    642  *
    643  * To convert from an adreno bool (uint) to nir, use:
    644  *
    645  *    absneg.s dst, (neg)src
    646  *
    647  * To convert back in the other direction:
    648  *
    649  *    absneg.s dst, (abs)src
    650  *
    651  * The CP step can clean up the absneg.s that cancel each other
    652  * out, and with a slight bit of extra cleverness (to recognize
    653  * the instructions which produce either a 0 or 1) can eliminate
    654  * the absneg.s's completely when an instruction that wants
    655  * 0/1 consumes the result.  For example, when a nir 'bcsel'
    656  * consumes the result of 'feq'.  So we should be able to get by
    657  * without a boolean resolve step, and without incurring any
    658  * extra penalty in instruction count.
    659  */
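/* Working through the table above: NIR true is ~0, ie. -1 as a signed
 * integer, so absneg.s with (abs) yields 1, the native true value.  Going
 * the other way, native true (1) with (neg) yields -1 == ~0, and false (0)
 * maps to 0 in both directions.
 */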
    660 
    661 /* NIR bool -> native (adreno): */
    662 static struct ir3_instruction *
    663 ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
    664 {
    665 	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
    666 }
    667 
    668 /* native (adreno) -> NIR bool: */
    669 static struct ir3_instruction *
    670 ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
    671 {
    672 	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
    673 }
    674 
    675 /*
    676  * alu/sfu instructions:
    677  */
    678 
    679 static void
    680 emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
    681 {
    682 	const nir_op_info *info = &nir_op_infos[alu->op];
    683 	struct ir3_instruction **dst, *src[info->num_inputs];
    684 	struct ir3_block *b = ctx->block;
    685 
    686 	dst = get_dst(ctx, &alu->dest.dest, MAX2(info->output_size, 1));
    687 
    688 	/* Vectors are special in that they have non-scalarized writemasks,
    689 	 * and just take the first swizzle channel for each argument in
    690 	 * order into each writemask channel.
    691 	 */
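	/* For example, a nir_op_vec3 like vec3(a.y, b.x, c.w) arrives here as
	 * three srcs whose first swizzle channels are .y, .x and .w; we just
	 * emit one mov per writemask channel from that channel of each src.
	 */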
    692 	if ((alu->op == nir_op_vec2) ||
    693 			(alu->op == nir_op_vec3) ||
    694 			(alu->op == nir_op_vec4)) {
    695 
    696 		for (int i = 0; i < info->num_inputs; i++) {
    697 			nir_alu_src *asrc = &alu->src[i];
    698 
    699 			compile_assert(ctx, !asrc->abs);
    700 			compile_assert(ctx, !asrc->negate);
    701 
    702 			src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
    703 			if (!src[i])
    704 				src[i] = create_immed(ctx->block, 0);
    705 			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
    706 		}
    707 
    708 		return;
    709 	}
    710 
    711 	/* General case: We can just grab the one used channel per src. */
    712 	for (int i = 0; i < info->num_inputs; i++) {
    713 		unsigned chan = ffs(alu->dest.write_mask) - 1;
    714 		nir_alu_src *asrc = &alu->src[i];
    715 
    716 		compile_assert(ctx, !asrc->abs);
    717 		compile_assert(ctx, !asrc->negate);
    718 
    719 		src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
    720 
    721 		compile_assert(ctx, src[i]);
    722 	}
    723 
    724 	switch (alu->op) {
    725 	case nir_op_f2i:
    726 		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32);
    727 		break;
    728 	case nir_op_f2u:
    729 		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32);
    730 		break;
    731 	case nir_op_i2f:
    732 		dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32);
    733 		break;
    734 	case nir_op_u2f:
    735 		dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
    736 		break;
    737 	case nir_op_imov:
    738 		dst[0] = ir3_MOV(b, src[0], TYPE_S32);
    739 		break;
    740 	case nir_op_fmov:
    741 		dst[0] = ir3_MOV(b, src[0], TYPE_F32);
    742 		break;
    743 	case nir_op_f2b:
    744 		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
    745 		dst[0]->cat2.condition = IR3_COND_NE;
    746 		dst[0] = ir3_n2b(b, dst[0]);
    747 		break;
    748 	case nir_op_b2f:
    749 		dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
    750 		break;
    751 	case nir_op_b2i:
    752 		dst[0] = ir3_b2n(b, src[0]);
    753 		break;
    754 	case nir_op_i2b:
    755 		dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
    756 		dst[0]->cat2.condition = IR3_COND_NE;
    757 		dst[0] = ir3_n2b(b, dst[0]);
    758 		break;
    759 
    760 	case nir_op_fneg:
    761 		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
    762 		break;
    763 	case nir_op_fabs:
    764 		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
    765 		break;
    766 	case nir_op_fmax:
    767 		dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
    768 		break;
    769 	case nir_op_fmin:
    770 		dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
    771 		break;
    772 	case nir_op_fmul:
    773 		dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
    774 		break;
    775 	case nir_op_fadd:
    776 		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
    777 		break;
    778 	case nir_op_fsub:
    779 		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
    780 		break;
    781 	case nir_op_ffma:
    782 		dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
    783 		break;
    784 	case nir_op_fddx:
    785 		dst[0] = ir3_DSX(b, src[0], 0);
    786 		dst[0]->cat5.type = TYPE_F32;
    787 		break;
    788 	case nir_op_fddy:
    789 		dst[0] = ir3_DSY(b, src[0], 0);
    790 		dst[0]->cat5.type = TYPE_F32;
    791 		break;
    793 	case nir_op_flt:
    794 		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
    795 		dst[0]->cat2.condition = IR3_COND_LT;
    796 		dst[0] = ir3_n2b(b, dst[0]);
    797 		break;
    798 	case nir_op_fge:
    799 		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
    800 		dst[0]->cat2.condition = IR3_COND_GE;
    801 		dst[0] = ir3_n2b(b, dst[0]);
    802 		break;
    803 	case nir_op_feq:
    804 		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
    805 		dst[0]->cat2.condition = IR3_COND_EQ;
    806 		dst[0] = ir3_n2b(b, dst[0]);
    807 		break;
    808 	case nir_op_fne:
    809 		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
    810 		dst[0]->cat2.condition = IR3_COND_NE;
    811 		dst[0] = ir3_n2b(b, dst[0]);
    812 		break;
    813 	case nir_op_fceil:
    814 		dst[0] = ir3_CEIL_F(b, src[0], 0);
    815 		break;
    816 	case nir_op_ffloor:
    817 		dst[0] = ir3_FLOOR_F(b, src[0], 0);
    818 		break;
    819 	case nir_op_ftrunc:
    820 		dst[0] = ir3_TRUNC_F(b, src[0], 0);
    821 		break;
    822 	case nir_op_fround_even:
    823 		dst[0] = ir3_RNDNE_F(b, src[0], 0);
    824 		break;
    825 	case nir_op_fsign:
    826 		dst[0] = ir3_SIGN_F(b, src[0], 0);
    827 		break;
    828 
    829 	case nir_op_fsin:
    830 		dst[0] = ir3_SIN(b, src[0], 0);
    831 		break;
    832 	case nir_op_fcos:
    833 		dst[0] = ir3_COS(b, src[0], 0);
    834 		break;
    835 	case nir_op_frsq:
    836 		dst[0] = ir3_RSQ(b, src[0], 0);
    837 		break;
    838 	case nir_op_frcp:
    839 		dst[0] = ir3_RCP(b, src[0], 0);
    840 		break;
    841 	case nir_op_flog2:
    842 		dst[0] = ir3_LOG2(b, src[0], 0);
    843 		break;
    844 	case nir_op_fexp2:
    845 		dst[0] = ir3_EXP2(b, src[0], 0);
    846 		break;
    847 	case nir_op_fsqrt:
    848 		dst[0] = ir3_SQRT(b, src[0], 0);
    849 		break;
    850 
    851 	case nir_op_iabs:
    852 		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
    853 		break;
    854 	case nir_op_iadd:
    855 		dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
    856 		break;
    857 	case nir_op_iand:
    858 		dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
    859 		break;
    860 	case nir_op_imax:
    861 		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
    862 		break;
    863 	case nir_op_umax:
    864 		dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
    865 		break;
    866 	case nir_op_imin:
    867 		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
    868 		break;
    869 	case nir_op_umin:
    870 		dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
    871 		break;
    872 	case nir_op_imul:
    873 		/*
    874 		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
    875 		 *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
    876 		 *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
    877 		 *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
    878 		 */
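		/* Why this works: writing a = (ah << 16) + al and
		 * b = (bh << 16) + bl,
		 *
		 *   a * b = (ah * bh << 32) + ((ah * bl + al * bh) << 16) + al * bl
		 *
		 * and the << 32 term vanishes mod 2^32, leaving exactly the three
		 * terms the mull.u/madsh.m16 sequence below accumulates.
		 */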
    879 		dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
    880 					ir3_MADSH_M16(b, src[0], 0, src[1], 0,
    881 						ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
    882 		break;
    883 	case nir_op_ineg:
    884 		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
    885 		break;
    886 	case nir_op_inot:
    887 		dst[0] = ir3_NOT_B(b, src[0], 0);
    888 		break;
    889 	case nir_op_ior:
    890 		dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
    891 		break;
    892 	case nir_op_ishl:
    893 		dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
    894 		break;
    895 	case nir_op_ishr:
    896 		dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
    897 		break;
    898 	case nir_op_isign: {
    899 		/* maybe this would be sane to lower in nir.. */
    900 		struct ir3_instruction *neg, *pos;
    901 
    902 		neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
    903 		neg->cat2.condition = IR3_COND_LT;
    904 
    905 		pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
    906 		pos->cat2.condition = IR3_COND_GT;
    907 
    908 		dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
    909 
    910 		break;
    911 	}
    912 	case nir_op_isub:
    913 		dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
    914 		break;
    915 	case nir_op_ixor:
    916 		dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
    917 		break;
    918 	case nir_op_ushr:
    919 		dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
    920 		break;
    921 	case nir_op_ilt:
    922 		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
    923 		dst[0]->cat2.condition = IR3_COND_LT;
    924 		dst[0] = ir3_n2b(b, dst[0]);
    925 		break;
    926 	case nir_op_ige:
    927 		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
    928 		dst[0]->cat2.condition = IR3_COND_GE;
    929 		dst[0] = ir3_n2b(b, dst[0]);
    930 		break;
    931 	case nir_op_ieq:
    932 		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
    933 		dst[0]->cat2.condition = IR3_COND_EQ;
    934 		dst[0] = ir3_n2b(b, dst[0]);
    935 		break;
    936 	case nir_op_ine:
    937 		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
    938 		dst[0]->cat2.condition = IR3_COND_NE;
    939 		dst[0] = ir3_n2b(b, dst[0]);
    940 		break;
    941 	case nir_op_ult:
    942 		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
    943 		dst[0]->cat2.condition = IR3_COND_LT;
    944 		dst[0] = ir3_n2b(b, dst[0]);
    945 		break;
    946 	case nir_op_uge:
    947 		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
    948 		dst[0]->cat2.condition = IR3_COND_GE;
    949 		dst[0] = ir3_n2b(b, dst[0]);
    950 		break;
    951 
    952 	case nir_op_bcsel:
    953 		dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
    954 		break;
    955 
    956 	case nir_op_bit_count:
    957 		dst[0] = ir3_CBITS_B(b, src[0], 0);
    958 		break;
    959 	case nir_op_ifind_msb: {
    960 		struct ir3_instruction *cmp;
    961 		dst[0] = ir3_CLZ_S(b, src[0], 0);
    962 		cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
    963 		cmp->cat2.condition = IR3_COND_GE;
    964 		dst[0] = ir3_SEL_B32(b,
    965 				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
    966 				cmp, 0, dst[0], 0);
    967 		break;
    968 	}
    969 	case nir_op_ufind_msb:
    970 		dst[0] = ir3_CLZ_B(b, src[0], 0);
    971 		dst[0] = ir3_SEL_B32(b,
    972 				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
    973 				src[0], 0, dst[0], 0);
    974 		break;
    975 	case nir_op_find_lsb:
    976 		dst[0] = ir3_BFREV_B(b, src[0], 0);
    977 		dst[0] = ir3_CLZ_B(b, dst[0], 0);
    978 		break;
    979 	case nir_op_bitfield_reverse:
    980 		dst[0] = ir3_BFREV_B(b, src[0], 0);
    981 		break;
    982 
    983 	default:
    984 		compile_error(ctx, "Unhandled ALU op: %s\n",
    985 				nir_op_infos[alu->op].name);
    986 		break;
    987 	}
    988 }
    989 
    990 /* handles direct/indirect UBO reads: */
    991 static void
    992 emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
    993 		struct ir3_instruction **dst)
    994 {
    995 	struct ir3_block *b = ctx->block;
    996 	struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
    997 	nir_const_value *const_offset;
    998 	/* UBO addresses are the first driver params: */
    999 	unsigned ubo = regid(ctx->so->constbase.ubo, 0);
   1000 	const unsigned ptrsz = pointer_size(ctx);
   1001 
   1002 	int off = 0;
   1003 
   1004 	/* First src is ubo index, which could either be an immed or not: */
   1005 	src0 = get_src(ctx, &intr->src[0])[0];
   1006 	if (is_same_type_mov(src0) &&
   1007 			(src0->regs[1]->flags & IR3_REG_IMMED)) {
   1008 		base_lo = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz));
   1009 		base_hi = create_uniform(ctx, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
   1010 	} else {
   1011 		base_lo = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
   1012 		base_hi = create_uniform_indirect(ctx, ubo + 1, get_addr(ctx, src0));
   1013 	}
   1014 
   1015 	/* note: on 32bit gpus base_hi is ignored and DCE'd */
   1016 	addr = base_lo;
   1017 
   1018 	const_offset = nir_src_as_const_value(intr->src[1]);
   1019 	if (const_offset) {
   1020 		off += const_offset->u32[0];
   1021 	} else {
   1022 		/* For load_ubo_indirect, second src is indirect offset: */
   1023 		src1 = get_src(ctx, &intr->src[1])[0];
   1024 
   1025 		/* and add offset to addr: */
   1026 		addr = ir3_ADD_S(b, addr, 0, src1, 0);
   1027 	}
   1028 
   1029 	/* if offset is too large to encode in the ldg, split it out: */
   1030 	if ((off + (intr->num_components * 4)) > 1024) {
   1031 		/* split out the minimal amount to improve the odds that
   1032 		 * cp can fit the immediate in the add.s instruction:
   1033 		 */
   1034 		unsigned off2 = off + (intr->num_components * 4) - 1024;
   1035 		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
   1036 		off -= off2;
   1037 	}
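	/* e.g. with made-up numbers: a 4-component load at off == 1500 splits
	 * out off2 = 1500 + 16 - 1024 = 492 into the add.s, leaving off = 1008
	 * so the per-component byte offsets (1008..1020) all still fit.
	 */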
   1038 
   1039 	if (ptrsz == 2) {
   1040 		struct ir3_instruction *carry;
   1041 
   1042 		/* handle 32b rollover, ie:
   1043 		 *   if (addr < base_lo)
   1044 		 *      base_hi++
   1045 		 */
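		/* e.g. with made-up values: base_lo = 0xffffff00, base_hi = 0x1 and
		 * a 0x200 byte offset: addr wraps to 0x00000100, which is < base_lo,
		 * so carry is 1 and base_hi becomes 0x2, giving the 64b address
		 * 0x2_00000100 (ie. 0x1_ffffff00 + 0x200), as expected.
		 */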
   1046 		carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
   1047 		carry->cat2.condition = IR3_COND_LT;
   1048 		base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);
   1049 
   1050 		addr = create_collect(b, (struct ir3_instruction*[]){ addr, base_hi }, 2);
   1051 	}
   1052 
   1053 	for (int i = 0; i < intr->num_components; i++) {
   1054 		struct ir3_instruction *load =
   1055 				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
   1056 		load->cat6.type = TYPE_U32;
   1057 		load->cat6.src_offset = off + i * 4;     /* byte offset */
   1058 		dst[i] = load;
   1059 	}
   1060 }
   1061 
   1062 /* handles array reads: */
   1063 static void
   1064 emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
   1065 		struct ir3_instruction **dst)
   1066 {
   1067 	nir_deref_var *dvar = intr->variables[0];
   1068 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
   1069 	struct ir3_array *arr = get_var(ctx, dvar->var);
   1070 
   1071 	compile_assert(ctx, dvar->deref.child &&
   1072 		(dvar->deref.child->deref_type == nir_deref_type_array));
   1073 
   1074 	switch (darr->deref_array_type) {
   1075 	case nir_deref_array_type_direct:
   1076 		/* direct access does not require anything special: */
   1077 		for (int i = 0; i < intr->num_components; i++) {
   1078 			unsigned n = darr->base_offset * 4 + i;
   1079 			compile_assert(ctx, n < arr->length);
   1080 			dst[i] = create_var_load(ctx, arr, n, NULL);
   1081 		}
   1082 		break;
   1083 	case nir_deref_array_type_indirect: {
   1084 		/* for indirect, we need to collect all the array elements: */
   1085 		struct ir3_instruction *addr =
   1086 				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
   1087 		for (int i = 0; i < intr->num_components; i++) {
   1088 			unsigned n = darr->base_offset * 4 + i;
   1089 			compile_assert(ctx, n < arr->length);
   1090 			dst[i] = create_var_load(ctx, arr, n, addr);
   1091 		}
   1092 		break;
   1093 	}
   1094 	default:
   1095 		compile_error(ctx, "Unhandled load deref type: %u\n",
   1096 				darr->deref_array_type);
   1097 		break;
   1098 	}
   1099 }
   1100 
   1101 /* handles array writes: */
   1102 static void
   1103 emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
   1104 {
   1105 	nir_deref_var *dvar = intr->variables[0];
   1106 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
   1107 	struct ir3_array *arr = get_var(ctx, dvar->var);
   1108 	struct ir3_instruction *addr;
   1109 	struct ir3_instruction * const *src;
   1110 	unsigned wrmask = nir_intrinsic_write_mask(intr);
   1111 
   1112 	compile_assert(ctx, dvar->deref.child &&
   1113 		(dvar->deref.child->deref_type == nir_deref_type_array));
   1114 
   1115 	src = get_src(ctx, &intr->src[0]);
   1116 
   1117 	switch (darr->deref_array_type) {
   1118 	case nir_deref_array_type_direct:
   1119 		addr = NULL;
   1120 		break;
   1121 	case nir_deref_array_type_indirect:
   1122 		addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
   1123 		break;
   1124 	default:
   1125 		compile_error(ctx, "Unhandled store deref type: %u\n",
   1126 				darr->deref_array_type);
   1127 		return;
   1128 	}
   1129 
   1130 	for (int i = 0; i < intr->num_components; i++) {
   1131 		if (!(wrmask & (1 << i)))
   1132 			continue;
   1133 		unsigned n = darr->base_offset * 4 + i;
   1134 		compile_assert(ctx, n < arr->length);
   1135 		create_var_store(ctx, arr, n, src[i], addr);
   1136 	}
   1137 }
   1138 
   1139 static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
   1140 		struct ir3_instruction *instr)
   1141 {
   1142 	struct ir3_shader_variant *so = ctx->so;
   1143 	unsigned r = regid(so->inputs_count, 0);
   1144 	unsigned n = so->inputs_count++;
   1145 
   1146 	so->inputs[n].sysval = true;
   1147 	so->inputs[n].slot = slot;
   1148 	so->inputs[n].compmask = 1;
   1149 	so->inputs[n].regid = r;
   1150 	so->inputs[n].interpolate = INTERP_MODE_FLAT;
   1151 	so->total_in++;
   1152 
   1153 	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
   1154 	ctx->ir->inputs[r] = instr;
   1155 }
   1156 
   1157 static void
   1158 emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
   1159 {
   1160 	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
   1161 	struct ir3_instruction **dst;
   1162 	struct ir3_instruction * const *src;
   1163 	struct ir3_block *b = ctx->block;
   1164 	nir_const_value *const_offset;
   1165 	int idx;
   1166 
   1167 	if (info->has_dest) {
   1168 		dst = get_dst(ctx, &intr->dest, intr->num_components);
   1169 	} else {
   1170 		dst = NULL;
   1171 	}
   1172 
   1173 	switch (intr->intrinsic) {
   1174 	case nir_intrinsic_load_uniform:
   1175 		idx = nir_intrinsic_base(intr);
   1176 		const_offset = nir_src_as_const_value(intr->src[0]);
   1177 		if (const_offset) {
   1178 			idx += const_offset->u32[0];
   1179 			for (int i = 0; i < intr->num_components; i++) {
   1180 				unsigned n = idx * 4 + i;
   1181 				dst[i] = create_uniform(ctx, n);
   1182 			}
   1183 		} else {
   1184 			src = get_src(ctx, &intr->src[0]);
   1185 			for (int i = 0; i < intr->num_components; i++) {
   1186 				int n = idx * 4 + i;
   1187 				dst[i] = create_uniform_indirect(ctx, n,
   1188 						get_addr(ctx, src[0]));
   1189 			}
   1190 			/* NOTE: if relative addressing is used, we set
   1191 			 * constlen in the compiler (to worst-case value)
   1192 			 * since we don't know in the assembler what the max
   1193 			 * addr reg value can be:
   1194 			 */
   1195 			ctx->so->constlen = ctx->s->num_uniforms;
   1196 		}
   1197 		break;
   1198 	case nir_intrinsic_load_ubo:
   1199 		emit_intrinsic_load_ubo(ctx, intr, dst);
   1200 		break;
   1201 	case nir_intrinsic_load_input:
   1202 		idx = nir_intrinsic_base(intr);
   1203 		const_offset = nir_src_as_const_value(intr->src[0]);
   1204 		if (const_offset) {
   1205 			idx += const_offset->u32[0];
   1206 			for (int i = 0; i < intr->num_components; i++) {
   1207 				unsigned n = idx * 4 + i;
   1208 				dst[i] = ctx->ir->inputs[n];
   1209 			}
   1210 		} else {
   1211 			src = get_src(ctx, &intr->src[0]);
   1212 			struct ir3_instruction *collect =
   1213 					create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
   1214 			struct ir3_instruction *addr = get_addr(ctx, src[0]);
   1215 			for (int i = 0; i < intr->num_components; i++) {
   1216 				unsigned n = idx * 4 + i;
   1217 				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
   1218 						n, addr, collect);
   1219 			}
   1220 		}
   1221 		break;
   1222 	case nir_intrinsic_load_var:
   1223 		emit_intrinsic_load_var(ctx, intr, dst);
   1224 		break;
   1225 	case nir_intrinsic_store_var:
   1226 		emit_intrinsic_store_var(ctx, intr);
   1227 		break;
   1228 	case nir_intrinsic_store_output:
   1229 		idx = nir_intrinsic_base(intr);
   1230 		const_offset = nir_src_as_const_value(intr->src[1]);
   1231 		compile_assert(ctx, const_offset != NULL);
   1232 		idx += const_offset->u32[0];
   1233 
   1234 		src = get_src(ctx, &intr->src[0]);
   1235 		for (int i = 0; i < intr->num_components; i++) {
   1236 			unsigned n = idx * 4 + i;
   1237 			ctx->ir->outputs[n] = src[i];
   1238 		}
   1239 		break;
   1240 	case nir_intrinsic_load_base_vertex:
   1241 		if (!ctx->basevertex) {
   1242 			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
   1243 			add_sysval_input(ctx, SYSTEM_VALUE_BASE_VERTEX,
   1244 					ctx->basevertex);
   1245 		}
   1246 		dst[0] = ctx->basevertex;
   1247 		break;
   1248 	case nir_intrinsic_load_vertex_id_zero_base:
   1249 	case nir_intrinsic_load_vertex_id:
   1250 		if (!ctx->vertex_id) {
   1251 			gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
   1252 				SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
   1253 			ctx->vertex_id = create_input(b, 0);
   1254 			add_sysval_input(ctx, sv, ctx->vertex_id);
   1255 		}
   1256 		dst[0] = ctx->vertex_id;
   1257 		break;
   1258 	case nir_intrinsic_load_instance_id:
   1259 		if (!ctx->instance_id) {
   1260 			ctx->instance_id = create_input(b, 0);
   1261 			add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
   1262 					ctx->instance_id);
   1263 		}
   1264 		dst[0] = ctx->instance_id;
   1265 		break;
   1266 	case nir_intrinsic_load_user_clip_plane:
   1267 		idx = nir_intrinsic_ucp_id(intr);
   1268 		for (int i = 0; i < intr->num_components; i++) {
   1269 			unsigned n = idx * 4 + i;
   1270 			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
   1271 		}
   1272 		break;
   1273 	case nir_intrinsic_load_front_face:
   1274 		if (!ctx->frag_face) {
   1275 			ctx->so->frag_face = true;
   1276 			ctx->frag_face = create_input(b, 0);
   1277 			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
   1278 		}
   1279 		/* for fragface, we always get -1 or 0, but that is inverse
   1280 		 * of what nir expects (where ~0 is true).  Unfortunately
   1281 		 * trying to widen from half to full in add.s seems to do a
   1282 		 * non-sign-extending widen (resulting in something that
   1283 		 * gets interpreted as float Inf??)
   1284 		 */
   1285 		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
   1286 		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
   1287 		break;
   1288 	case nir_intrinsic_discard_if:
   1289 	case nir_intrinsic_discard: {
   1290 		struct ir3_instruction *cond, *kill;
   1291 
   1292 		if (intr->intrinsic == nir_intrinsic_discard_if) {
   1293 			/* conditional discard: */
   1294 			src = get_src(ctx, &intr->src[0]);
   1295 			cond = ir3_b2n(b, src[0]);
   1296 		} else {
   1297 			/* unconditional discard: */
   1298 			cond = create_immed(b, 1);
   1299 		}
   1300 
   1301 		/* NOTE: only cmps.*.* can write p0.x: */
   1302 		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
   1303 		cond->cat2.condition = IR3_COND_NE;
   1304 
   1305 		/* condition always goes in predicate register: */
   1306 		cond->regs[0]->num = regid(REG_P0, 0);
   1307 
   1308 		kill = ir3_KILL(b, cond, 0);
   1309 		array_insert(ctx->ir->predicates, kill);
   1310 
   1311 		array_insert(ctx->ir->keeps, kill);
   1312 		ctx->so->has_kill = true;
   1313 
   1314 		break;
   1315 	}
   1316 	default:
   1317 		compile_error(ctx, "Unhandled intrinsic type: %s\n",
   1318 				nir_intrinsic_infos[intr->intrinsic].name);
   1319 		break;
   1320 	}
   1321 }
   1322 
   1323 static void
   1324 emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
   1325 {
   1326 	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
   1327 			instr->def.num_components);
   1328 	for (int i = 0; i < instr->def.num_components; i++)
   1329 		dst[i] = create_immed(ctx->block, instr->value.u32[i]);
   1330 }
   1331 
   1332 static void
   1333 emit_undef(struct ir3_compile *ctx, nir_ssa_undef_instr *undef)
   1334 {
   1335 	struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
   1336 			undef->def.num_components);
   1337 	/* backend doesn't want undefined instructions, so just plug
   1338 	 * in 0.0..
   1339 	 */
   1340 	for (int i = 0; i < undef->def.num_components; i++)
   1341 		dst[i] = create_immed(ctx->block, fui(0.0));
   1342 }
   1343 
   1344 /*
   1345  * texture fetch/sample instructions:
   1346  */
   1347 
   1348 static void
   1349 tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
   1350 {
   1351 	unsigned coords, flags = 0;
   1352 
   1353 	/* note: would use tex->coord_components.. except txs.. also,
   1354 	 * since array index goes after shadow ref, we don't want to
   1355 	 * count it:
   1356 	 */
   1357 	switch (tex->sampler_dim) {
   1358 	case GLSL_SAMPLER_DIM_1D:
   1359 	case GLSL_SAMPLER_DIM_BUF:
   1360 		coords = 1;
   1361 		break;
   1362 	case GLSL_SAMPLER_DIM_2D:
   1363 	case GLSL_SAMPLER_DIM_RECT:
   1364 	case GLSL_SAMPLER_DIM_EXTERNAL:
   1365 	case GLSL_SAMPLER_DIM_MS:
   1366 		coords = 2;
   1367 		break;
   1368 	case GLSL_SAMPLER_DIM_3D:
   1369 	case GLSL_SAMPLER_DIM_CUBE:
   1370 		coords = 3;
   1371 		flags |= IR3_INSTR_3D;
   1372 		break;
   1373 	default:
   1374 		unreachable("bad sampler_dim");
   1375 	}
   1376 
   1377 	if (tex->is_shadow && tex->op != nir_texop_lod)
   1378 		flags |= IR3_INSTR_S;
   1379 
   1380 	if (tex->is_array && tex->op != nir_texop_lod)
   1381 		flags |= IR3_INSTR_A;
   1382 
   1383 	*flagsp = flags;
   1384 	*coordsp = coords;
   1385 }
   1386 
   1387 static void
   1388 emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
   1389 {
   1390 	struct ir3_block *b = ctx->block;
   1391 	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
   1392 	struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
   1393 	struct ir3_instruction *lod, *compare, *proj;
   1394 	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
   1395 	unsigned i, coords, flags;
   1396 	unsigned nsrc0 = 0, nsrc1 = 0;
   1397 	type_t type;
   1398 	opc_t opc = 0;
   1399 
   1400 	coord = off = ddx = ddy = NULL;
   1401 	lod = proj = compare = NULL;
   1402 
   1403 	/* TODO: might just be one component for gathers? */
   1404 	dst = get_dst(ctx, &tex->dest, 4);
   1405 
   1406 	for (unsigned i = 0; i < tex->num_srcs; i++) {
   1407 		switch (tex->src[i].src_type) {
   1408 		case nir_tex_src_coord:
   1409 			coord = get_src(ctx, &tex->src[i].src);
   1410 			break;
   1411 		case nir_tex_src_bias:
   1412 			lod = get_src(ctx, &tex->src[i].src)[0];
   1413 			has_bias = true;
   1414 			break;
   1415 		case nir_tex_src_lod:
   1416 			lod = get_src(ctx, &tex->src[i].src)[0];
   1417 			has_lod = true;
   1418 			break;
   1419 		case nir_tex_src_comparator: /* shadow comparator */
   1420 			compare = get_src(ctx, &tex->src[i].src)[0];
   1421 			break;
   1422 		case nir_tex_src_projector:
   1423 			proj = get_src(ctx, &tex->src[i].src)[0];
   1424 			has_proj = true;
   1425 			break;
   1426 		case nir_tex_src_offset:
   1427 			off = get_src(ctx, &tex->src[i].src);
   1428 			has_off = true;
   1429 			break;
   1430 		case nir_tex_src_ddx:
   1431 			ddx = get_src(ctx, &tex->src[i].src);
   1432 			break;
   1433 		case nir_tex_src_ddy:
   1434 			ddy = get_src(ctx, &tex->src[i].src);
   1435 			break;
   1436 		default:
   1437 			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
   1438 					tex->src[i].src_type);
   1439 			return;
   1440 		}
   1441 	}
   1442 
   1443 	switch (tex->op) {
   1444 	case nir_texop_tex:      opc = OPC_SAM;      break;
   1445 	case nir_texop_txb:      opc = OPC_SAMB;     break;
   1446 	case nir_texop_txl:      opc = OPC_SAML;     break;
   1447 	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
   1448 	case nir_texop_txf:      opc = OPC_ISAML;    break;
   1449 	case nir_texop_lod:      opc = OPC_GETLOD;   break;
   1450 	case nir_texop_txf_ms:
   1451 	case nir_texop_txs:
   1452 	case nir_texop_tg4:
   1453 	case nir_texop_query_levels:
   1454 	case nir_texop_texture_samples:
   1455 	case nir_texop_samples_identical:
   1456 	case nir_texop_txf_ms_mcs:
   1457 		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
   1458 		return;
   1459 	}
   1460 
   1461 	tex_info(tex, &flags, &coords);
   1462 
   1463 	/*
   1464 	 * lay out the first argument in the proper order:
   1465 	 *  - actual coordinates first
   1466 	 *  - shadow reference
   1467 	 *  - array index
   1468 	 *  - projection w
   1469 	 *  - starting at offset 4, dpdx.xy, dpdy.xy
   1470 	 *
   1471 	 * bias/lod go into the second arg
   1472 	 */
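	/* e.g. for a 2D array shadow sample, src0 ends up as
	 * { x, y, compare, array_index }, and only txd appends the
	 * dpdx.xy/dpdy.xy values starting at offset 4.
	 */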
   1473 
   1474 	/* insert tex coords: */
   1475 	for (i = 0; i < coords; i++)
   1476 		src0[i] = coord[i];
   1477 
   1478 	nsrc0 = i;
   1479 
   1480 	/* scale up integer coords for TXF based on the LOD */
   1481 	if (ctx->unminify_coords && (opc == OPC_ISAML)) {
   1482 		assert(has_lod);
   1483 		for (i = 0; i < coords; i++)
   1484 			src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
   1485 	}
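	/* e.g. an integer texel coord of 3 at lod == 2 becomes 3 << 2 == 12,
	 * presumably scaling it back up into level-0 texel units (hence
	 * "unminify_coords").
	 */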
   1486 
   1487 	if (coords == 1) {
   1488 		/* hw doesn't do 1d, so we treat it as 2d with
   1489 		 * height of 1, and patch up the y coord.
   1490 		 * TODO: y coord should be (int)0 in some cases..
   1491 		 */
   1492 		src0[nsrc0++] = create_immed(b, fui(0.5));
   1493 	}
   1494 
   1495 	if (tex->is_shadow && tex->op != nir_texop_lod)
   1496 		src0[nsrc0++] = compare;
   1497 
   1498 	if (tex->is_array && tex->op != nir_texop_lod) {
   1499 		struct ir3_instruction *idx = coord[coords];
   1500 
   1501 		/* the array coord for cube arrays needs 0.5 added to it */
   1502 		if (ctx->array_index_add_half && (opc != OPC_ISAML))
   1503 			idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);
   1504 
   1505 		src0[nsrc0++] = idx;
   1506 	}
   1507 
   1508 	if (has_proj) {
   1509 		src0[nsrc0++] = proj;
   1510 		flags |= IR3_INSTR_P;
   1511 	}
   1512 
   1513 	/* pad to 4, then ddx/ddy: */
   1514 	if (tex->op == nir_texop_txd) {
   1515 		while (nsrc0 < 4)
   1516 			src0[nsrc0++] = create_immed(b, fui(0.0));
   1517 		for (i = 0; i < coords; i++)
   1518 			src0[nsrc0++] = ddx[i];
   1519 		if (coords < 2)
   1520 			src0[nsrc0++] = create_immed(b, fui(0.0));
   1521 		for (i = 0; i < coords; i++)
   1522 			src0[nsrc0++] = ddy[i];
   1523 		if (coords < 2)
   1524 			src0[nsrc0++] = create_immed(b, fui(0.0));
   1525 	}
   1526 
   1527 	/*
   1528 	 * second argument (if applicable):
   1529 	 *  - offsets
   1530 	 *  - lod
   1531 	 *  - bias
   1532 	 */
   1533 	if (has_off | has_lod | has_bias) {
   1534 		if (has_off) {
   1535 			for (i = 0; i < coords; i++)
   1536 				src1[nsrc1++] = off[i];
   1537 			if (coords < 2)
   1538 				src1[nsrc1++] = create_immed(b, fui(0.0));
   1539 			flags |= IR3_INSTR_O;
   1540 		}
   1541 
   1542 		if (has_lod | has_bias)
   1543 			src1[nsrc1++] = lod;
   1544 	}
   1545 
   1546 	switch (tex->dest_type) {
   1547 	case nir_type_invalid:
   1548 	case nir_type_float:
   1549 		type = TYPE_F32;
   1550 		break;
   1551 	case nir_type_int:
   1552 		type = TYPE_S32;
   1553 		break;
   1554 	case nir_type_uint:
   1555 	case nir_type_bool:
   1556 		type = TYPE_U32;
   1557 		break;
   1558 	default:
   1559 		unreachable("bad dest_type");
   1560 	}
   1561 
   1562 	if (opc == OPC_GETLOD)
   1563 		type = TYPE_U32;
   1564 
   1565 	unsigned tex_idx = tex->texture_index;
   1566 
   1567 	ctx->max_texture_index = MAX2(ctx->max_texture_index, tex_idx);
   1568 
   1569 	struct ir3_instruction *col0 = create_collect(b, src0, nsrc0);
   1570 	struct ir3_instruction *col1 = create_collect(b, src1, nsrc1);
   1571 
   1572 	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW, flags,
   1573 			tex_idx, tex_idx, col0, col1);
   1574 
   1575 	if ((ctx->astc_srgb & (1 << tex_idx)) && !nir_tex_instr_is_query(tex)) {
   1576 		/* only need first 3 components: */
   1577 		sam->regs[0]->wrmask = 0x7;
   1578 		split_dest(b, dst, sam, 0, 3);
   1579 
   1580 		/* we need to sample the alpha separately with a non-ASTC
   1581 		 * texture state:
   1582 		 */
   1583 		sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_W, flags,
   1584 				tex_idx, tex_idx, col0, col1);
   1585 
   1586 		array_insert(ctx->ir->astc_srgb, sam);
   1587 
   1588 		/* fixup .w component: */
   1589 		split_dest(b, &dst[3], sam, 3, 1);
   1590 	} else {
   1591 		/* normal (non-workaround) case: */
   1592 		split_dest(b, dst, sam, 0, 4);
   1593 	}
   1594 
   1595 	/* GETLOD returns results in 4.8 fixed point */
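	/* e.g. a raw getlod result of 0x180 (384) represents 384 / 256 == 1.5
	 * after the scale below.
	 */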
   1596 	if (opc == OPC_GETLOD) {
   1597 		struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));
   1598 
   1599 		compile_assert(ctx, tex->dest_type == nir_type_float);
   1600 		for (i = 0; i < 2; i++) {
   1601 			dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
   1602 							   factor, 0);
   1603 		}
   1604 	}
   1605 }
   1606 
   1607 static void
   1608 emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
   1609 {
   1610 	struct ir3_block *b = ctx->block;
   1611 	struct ir3_instruction **dst, *sam;
   1612 
   1613 	dst = get_dst(ctx, &tex->dest, 1);
   1614 
   1615 	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
   1616 			tex->texture_index, tex->texture_index, NULL, NULL);
   1617 
    1618 	/* even though there is only one component, it ends up in
    1619 	 * .z rather than .x, so we still need a split_dest()
    1620 	 */
   1621 	split_dest(b, dst, sam, 0, 3);
   1622 
   1623 	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
   1624 	 * the value in TEX_CONST_0 is zero-based.
   1625 	 */
   1626 	if (ctx->levels_add_one)
   1627 		dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
   1628 }
   1629 
   1630 static void
   1631 emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
   1632 {
   1633 	struct ir3_block *b = ctx->block;
   1634 	struct ir3_instruction **dst, *sam;
   1635 	struct ir3_instruction *lod;
   1636 	unsigned flags, coords;
   1637 
   1638 	tex_info(tex, &flags, &coords);
   1639 
   1640 	/* Actually we want the number of dimensions, not coordinates. This
   1641 	 * distinction only matters for cubes.
   1642 	 */
   1643 	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
   1644 		coords = 2;
   1645 
   1646 	dst = get_dst(ctx, &tex->dest, 4);
   1647 
   1648 	compile_assert(ctx, tex->num_srcs == 1);
   1649 	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
   1650 
   1651 	lod = get_src(ctx, &tex->src[0].src)[0];
   1652 
   1653 	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
   1654 			tex->texture_index, tex->texture_index, lod, NULL);
   1655 
   1656 	split_dest(b, dst, sam, 0, 4);
   1657 
   1658 	/* Array size actually ends up in .w rather than .z. This doesn't
   1659 	 * matter for miplevel 0, but for higher mips the value in z is
   1660 	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
   1661 	 * returned, which means that we have to add 1 to it for arrays.
   1662 	 */
   1663 	if (tex->is_array) {
   1664 		if (ctx->levels_add_one) {
   1665 			dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
   1666 		} else {
   1667 			dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
   1668 		}
   1669 	}
   1670 }
   1671 
   1672 static void
   1673 emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
   1674 {
   1675 	struct ir3_instruction *phi, **dst;
   1676 
   1677 	/* NOTE: phi's should be lowered to scalar at this point */
   1678 	compile_assert(ctx, nphi->dest.ssa.num_components == 1);
   1679 
   1680 	dst = get_dst(ctx, &nphi->dest, 1);
   1681 
   1682 	phi = ir3_instr_create2(ctx->block, OPC_META_PHI,
   1683 			1 + exec_list_length(&nphi->srcs));
   1684 	ir3_reg_create(phi, 0, 0);         /* dst */
   1685 	phi->phi.nphi = nphi;
   1686 
   1687 	dst[0] = phi;
   1688 }
   1689 
    1690 /* phi instructions are left partially constructed.  We don't resolve
    1691  * their srcs until after all blocks have been emitted, since (eg. for
    1692  * loops) one of the phi's srcs might be defined after the phi due to
    1693  * back edges in the CFG.
    1694  */
   1695 static void
   1696 resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
   1697 {
   1698 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
   1699 		nir_phi_instr *nphi;
   1700 
   1701 		/* phi's only come at start of block: */
   1702 		if (instr->opc != OPC_META_PHI)
   1703 			break;
   1704 
   1705 		if (!instr->phi.nphi)
   1706 			break;
   1707 
   1708 		nphi = instr->phi.nphi;
   1709 		instr->phi.nphi = NULL;
   1710 
   1711 		foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
   1712 			struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
   1713 
   1714 			/* NOTE: src might not be in the same block as it comes from
   1715 			 * according to the phi.. but in the end the backend assumes
   1716 			 * it will be able to assign the same register to each (which
   1717 			 * only works if it is assigned in the src block), so insert
   1718 			 * an extra mov to make sure the phi src is assigned in the
   1719 			 * block it comes from:
   1720 			 */
   1721 			src = ir3_MOV(get_block(ctx, nsrc->pred), src, TYPE_U32);
   1722 
   1723 			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
   1724 		}
   1725 	}
   1726 }
   1727 
   1728 static void
   1729 emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump)
   1730 {
   1731 	switch (jump->type) {
   1732 	case nir_jump_break:
   1733 	case nir_jump_continue:
    1734 		/* I *think* we can simply ignore this, and use the
   1735 		 * successor block link to figure out where we need to
   1736 		 * jump to for break/continue
   1737 		 */
   1738 		break;
   1739 	default:
   1740 		compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
   1741 		break;
   1742 	}
   1743 }
   1744 
   1745 static void
   1746 emit_instr(struct ir3_compile *ctx, nir_instr *instr)
   1747 {
   1748 	switch (instr->type) {
   1749 	case nir_instr_type_alu:
   1750 		emit_alu(ctx, nir_instr_as_alu(instr));
   1751 		break;
   1752 	case nir_instr_type_intrinsic:
   1753 		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
   1754 		break;
   1755 	case nir_instr_type_load_const:
   1756 		emit_load_const(ctx, nir_instr_as_load_const(instr));
   1757 		break;
   1758 	case nir_instr_type_ssa_undef:
   1759 		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
   1760 		break;
   1761 	case nir_instr_type_tex: {
   1762 		nir_tex_instr *tex = nir_instr_as_tex(instr);
    1763 		/* a couple of tex instructions get special-cased:
   1764 		 */
   1765 		switch (tex->op) {
   1766 		case nir_texop_txs:
   1767 			emit_tex_txs(ctx, tex);
   1768 			break;
   1769 		case nir_texop_query_levels:
   1770 			emit_tex_query_levels(ctx, tex);
   1771 			break;
   1772 		default:
   1773 			emit_tex(ctx, tex);
   1774 			break;
   1775 		}
   1776 		break;
   1777 	}
   1778 	case nir_instr_type_phi:
   1779 		emit_phi(ctx, nir_instr_as_phi(instr));
   1780 		break;
   1781 	case nir_instr_type_jump:
   1782 		emit_jump(ctx, nir_instr_as_jump(instr));
   1783 		break;
   1784 	case nir_instr_type_call:
   1785 	case nir_instr_type_parallel_copy:
   1786 		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
   1787 		break;
   1788 	}
   1789 }
   1790 
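/* Look up (or lazily create) the ir3_block corresponding to a nir_block.
 * Creating blocks on demand lets forward edges (eg. successors of an
 * if/loop that have not been emitted yet) be referenced before the target
 * block is actually filled in.
 */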
   1791 static struct ir3_block *
   1792 get_block(struct ir3_compile *ctx, nir_block *nblock)
   1793 {
   1794 	struct ir3_block *block;
   1795 	struct hash_entry *entry;
   1796 	entry = _mesa_hash_table_search(ctx->block_ht, nblock);
   1797 	if (entry)
   1798 		return entry->data;
   1799 
   1800 	block = ir3_block_create(ctx->ir);
   1801 	block->nblock = nblock;
   1802 	_mesa_hash_table_insert(ctx->block_ht, nblock, block);
   1803 
   1804 	return block;
   1805 }
   1806 
   1807 static void
   1808 emit_block(struct ir3_compile *ctx, nir_block *nblock)
   1809 {
   1810 	struct ir3_block *block = get_block(ctx, nblock);
   1811 
   1812 	for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
   1813 		if (nblock->successors[i]) {
   1814 			block->successors[i] =
   1815 				get_block(ctx, nblock->successors[i]);
   1816 		}
   1817 	}
   1818 
   1819 	ctx->block = block;
   1820 	list_addtail(&block->node, &ctx->ir->block_list);
   1821 
   1822 	/* re-emit addr register in each block if needed: */
   1823 	_mesa_hash_table_destroy(ctx->addr_ht, NULL);
   1824 	ctx->addr_ht = NULL;
   1825 
   1826 	nir_foreach_instr(instr, nblock) {
   1827 		emit_instr(ctx, instr);
   1828 		if (ctx->error)
   1829 			return;
   1830 	}
   1831 }
   1832 
   1833 static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list);
   1834 
   1835 static void
   1836 emit_if(struct ir3_compile *ctx, nir_if *nif)
   1837 {
   1838 	struct ir3_instruction *condition = get_src(ctx, &nif->condition)[0];
   1839 
   1840 	ctx->block->condition =
   1841 		get_predicate(ctx, ir3_b2n(condition->block, condition));
   1842 
   1843 	emit_cf_list(ctx, &nif->then_list);
   1844 	emit_cf_list(ctx, &nif->else_list);
   1845 }
   1846 
   1847 static void
   1848 emit_loop(struct ir3_compile *ctx, nir_loop *nloop)
   1849 {
   1850 	emit_cf_list(ctx, &nloop->body);
   1851 }
   1852 
   1853 static void
   1854 emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
   1855 {
   1856 	foreach_list_typed(nir_cf_node, node, node, list) {
   1857 		switch (node->type) {
   1858 		case nir_cf_node_block:
   1859 			emit_block(ctx, nir_cf_node_as_block(node));
   1860 			break;
   1861 		case nir_cf_node_if:
   1862 			emit_if(ctx, nir_cf_node_as_if(node));
   1863 			break;
   1864 		case nir_cf_node_loop:
   1865 			emit_loop(ctx, nir_cf_node_as_loop(node));
   1866 			break;
   1867 		case nir_cf_node_function:
   1868 			compile_error(ctx, "TODO\n");
   1869 			break;
   1870 		}
   1871 	}
   1872 }
   1873 
   1874 /* emit stream-out code.  At this point, the current block is the original
   1875  * (nir) end block, and nir ensures that all flow control paths terminate
   1876  * into the end block.  We re-purpose the original end block to generate
   1877  * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
   1878  * block holding stream-out write instructions, followed by the new end
   1879  * block:
   1880  *
   1881  *   blockOrigEnd {
   1882  *      p0.x = (vtxcnt < maxvtxcnt)
   1883  *      // succs: blockStreamOut, blockNewEnd
   1884  *   }
   1885  *   blockStreamOut {
   1886  *      ... stream-out instructions ...
   1887  *      // succs: blockNewEnd
   1888  *   }
   1889  *   blockNewEnd {
   1890  *   }
   1891  */
   1892 static void
   1893 emit_stream_out(struct ir3_compile *ctx)
   1894 {
   1895 	struct ir3_shader_variant *v = ctx->so;
   1896 	struct ir3 *ir = ctx->ir;
   1897 	struct pipe_stream_output_info *strmout =
   1898 			&ctx->so->shader->stream_output;
   1899 	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
   1900 	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
   1901 	struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];
   1902 
   1903 	/* create vtxcnt input in input block at top of shader,
   1904 	 * so that it is seen as live over the entire duration
   1905 	 * of the shader:
   1906 	 */
   1907 	vtxcnt = create_input(ctx->in_block, 0);
   1908 	add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);
   1909 
   1910 	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
   1911 
    1912 	/* at this point, we are at the original 'end' block;
    1913 	 * re-purpose this block for the stream-out condition, then
    1914 	 * append the stream-out block and new end block
   1915 	 */
   1916 	orig_end_block = ctx->block;
   1917 
   1918 	stream_out_block = ir3_block_create(ir);
   1919 	list_addtail(&stream_out_block->node, &ir->block_list);
   1920 
   1921 	new_end_block = ir3_block_create(ir);
   1922 	list_addtail(&new_end_block->node, &ir->block_list);
   1923 
   1924 	orig_end_block->successors[0] = stream_out_block;
   1925 	orig_end_block->successors[1] = new_end_block;
   1926 	stream_out_block->successors[0] = new_end_block;
   1927 
   1928 	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
   1929 	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
   1930 	cond->regs[0]->num = regid(REG_P0, 0);
   1931 	cond->cat2.condition = IR3_COND_LT;
   1932 
    1933 	/* the condition goes on the block preceding the conditional,
   1934 	 * since it is used to pick which of the two successor
   1935 	 * paths to take:
   1936 	 */
   1937 	orig_end_block->condition = cond;
   1938 
   1939 	/* switch to stream_out_block to generate the stream-out
   1940 	 * instructions:
   1941 	 */
   1942 	ctx->block = stream_out_block;
   1943 
   1944 	/* Calculate base addresses based on vtxcnt.  Instructions
    1945 	 * generated for bases not used in the following loop will be
   1946 	 * stripped out in the backend.
   1947 	 */
   1948 	for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
   1949 		unsigned stride = strmout->stride[i];
   1950 		struct ir3_instruction *base, *off;
   1951 
   1952 		base = create_uniform(ctx, regid(v->constbase.tfbo, i));
   1953 
   1954 		/* 24-bit should be enough: */
   1955 		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
   1956 				create_immed(ctx->block, stride * 4), 0);
   1957 
   1958 		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
   1959 	}
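         	/* (ie. bases[i] = TFBO[i] + vtxcnt * stride[i] * 4 -- the
         	 * pipe_stream_output strides are in dwords, so the *4 converts
         	 * to a byte offset)
         	 */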
   1960 
   1961 	/* Generate the per-output store instructions: */
   1962 	for (unsigned i = 0; i < strmout->num_outputs; i++) {
   1963 		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
   1964 			unsigned c = j + strmout->output[i].start_component;
   1965 			struct ir3_instruction *base, *out, *stg;
   1966 
   1967 			base = bases[strmout->output[i].output_buffer];
   1968 			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];
   1969 
   1970 			stg = ir3_STG(ctx->block, base, 0, out, 0,
   1971 					create_immed(ctx->block, 1), 0);
   1972 			stg->cat6.type = TYPE_U32;
   1973 			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;
   1974 
   1975 			array_insert(ctx->ir->keeps, stg);
   1976 		}
   1977 	}
   1978 
   1979 	/* and finally switch to the new_end_block: */
   1980 	ctx->block = new_end_block;
   1981 }
   1982 
   1983 static void
   1984 emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
   1985 {
   1986 	nir_metadata_require(impl, nir_metadata_block_index);
   1987 
   1988 	emit_cf_list(ctx, &impl->body);
   1989 	emit_block(ctx, impl->end_block);
   1990 
   1991 	/* at this point, we should have a single empty block,
   1992 	 * into which we emit the 'end' instruction.
   1993 	 */
   1994 	compile_assert(ctx, list_empty(&ctx->block->instr_list));
   1995 
   1996 	/* If stream-out (aka transform-feedback) enabled, emit the
   1997 	 * stream-out instructions, followed by a new empty block (into
   1998 	 * which the 'end' instruction lands).
   1999 	 *
   2000 	 * NOTE: it is done in this order, rather than inserting before
   2001 	 * we emit end_block, because NIR guarantees that all blocks
   2002 	 * flow into end_block, and that end_block has no successors.
   2003 	 * So by re-purposing end_block as the first block of stream-
   2004 	 * out, we guarantee that all exit paths flow into the stream-
   2005 	 * out instructions.
   2006 	 */
   2007 	if ((ctx->compiler->gpu_id < 500) &&
   2008 			(ctx->so->shader->stream_output.num_outputs > 0) &&
   2009 			!ctx->so->key.binning_pass) {
   2010 		debug_assert(ctx->so->type == SHADER_VERTEX);
   2011 		emit_stream_out(ctx);
   2012 	}
   2013 
   2014 	ir3_END(ctx->block);
   2015 }
   2016 
   2017 static void
   2018 setup_input(struct ir3_compile *ctx, nir_variable *in)
   2019 {
   2020 	struct ir3_shader_variant *so = ctx->so;
   2021 	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
   2022 	unsigned ncomp = glsl_get_components(in->type);
   2023 	unsigned n = in->data.driver_location;
   2024 	unsigned slot = in->data.location;
   2025 
   2026 	DBG("; in: slot=%u, len=%ux%u, drvloc=%u",
   2027 			slot, array_len, ncomp, n);
   2028 
   2029 	/* let's pretend things other than vec4 don't exist: */
   2030 	ncomp = MAX2(ncomp, 4);
   2031 	compile_assert(ctx, ncomp == 4);
   2032 
   2033 	so->inputs[n].slot = slot;
   2034 	so->inputs[n].compmask = (1 << ncomp) - 1;
   2035 	so->inputs_count = MAX2(so->inputs_count, n + 1);
   2036 	so->inputs[n].interpolate = in->data.interpolation;
   2037 
   2038 	if (ctx->so->type == SHADER_FRAGMENT) {
   2039 		for (int i = 0; i < ncomp; i++) {
   2040 			struct ir3_instruction *instr = NULL;
   2041 			unsigned idx = (n * 4) + i;
   2042 
   2043 			if (slot == VARYING_SLOT_POS) {
   2044 				so->inputs[n].bary = false;
   2045 				so->frag_coord = true;
   2046 				instr = create_frag_coord(ctx, i);
   2047 			} else if (slot == VARYING_SLOT_PNTC) {
   2048 				/* see for example st_get_generic_varying_index().. this is
   2049 				 * maybe a bit mesa/st specific.  But we need things to line
   2050 				 * up for this in fdN_program:
   2051 				 *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
   2052 				 *    if (emit->sprite_coord_enable & texmask) {
   2053 				 *       ...
   2054 				 *    }
   2055 				 */
   2056 				so->inputs[n].slot = VARYING_SLOT_VAR8;
   2057 				so->inputs[n].bary = true;
   2058 				instr = create_frag_input(ctx, false);
   2059 			} else {
   2060 				bool use_ldlv = false;
   2061 
   2062 				/* detect the special case for front/back colors where
   2063 				 * we need to do flat vs smooth shading depending on
   2064 				 * rast state:
   2065 				 */
   2066 				if (in->data.interpolation == INTERP_MODE_NONE) {
   2067 					switch (slot) {
   2068 					case VARYING_SLOT_COL0:
   2069 					case VARYING_SLOT_COL1:
   2070 					case VARYING_SLOT_BFC0:
   2071 					case VARYING_SLOT_BFC1:
   2072 						so->inputs[n].rasterflat = true;
   2073 						break;
   2074 					default:
   2075 						break;
   2076 					}
   2077 				}
   2078 
   2079 				if (ctx->flat_bypass) {
   2080 					if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
   2081 							(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
   2082 						use_ldlv = true;
   2083 				}
   2084 
   2085 				so->inputs[n].bary = true;
   2086 
   2087 				instr = create_frag_input(ctx, use_ldlv);
   2088 			}
   2089 
   2090 			compile_assert(ctx, idx < ctx->ir->ninputs);
   2091 
   2092 			ctx->ir->inputs[idx] = instr;
   2093 		}
   2094 	} else if (ctx->so->type == SHADER_VERTEX) {
   2095 		for (int i = 0; i < ncomp; i++) {
   2096 			unsigned idx = (n * 4) + i;
   2097 			compile_assert(ctx, idx < ctx->ir->ninputs);
   2098 			ctx->ir->inputs[idx] = create_input(ctx->block, idx);
   2099 		}
   2100 	} else {
   2101 		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
   2102 	}
   2103 
   2104 	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
   2105 		so->total_in += ncomp;
   2106 	}
   2107 }
   2108 
   2109 static void
   2110 setup_output(struct ir3_compile *ctx, nir_variable *out)
   2111 {
   2112 	struct ir3_shader_variant *so = ctx->so;
   2113 	unsigned array_len = MAX2(glsl_get_length(out->type), 1);
   2114 	unsigned ncomp = glsl_get_components(out->type);
   2115 	unsigned n = out->data.driver_location;
   2116 	unsigned slot = out->data.location;
   2117 	unsigned comp = 0;
   2118 
   2119 	DBG("; out: slot=%u, len=%ux%u, drvloc=%u",
   2120 			slot, array_len, ncomp, n);
   2121 
   2122 	/* let's pretend things other than vec4 don't exist: */
   2123 	ncomp = MAX2(ncomp, 4);
   2124 	compile_assert(ctx, ncomp == 4);
   2125 
   2126 	if (ctx->so->type == SHADER_FRAGMENT) {
   2127 		switch (slot) {
   2128 		case FRAG_RESULT_DEPTH:
   2129 			comp = 2;  /* tgsi will write to .z component */
   2130 			so->writes_pos = true;
   2131 			break;
   2132 		case FRAG_RESULT_COLOR:
   2133 			so->color0_mrt = 1;
   2134 			break;
   2135 		default:
   2136 			if (slot >= FRAG_RESULT_DATA0)
   2137 				break;
   2138 			compile_error(ctx, "unknown FS output name: %s\n",
   2139 					gl_frag_result_name(slot));
   2140 		}
   2141 	} else if (ctx->so->type == SHADER_VERTEX) {
   2142 		switch (slot) {
   2143 		case VARYING_SLOT_POS:
   2144 			so->writes_pos = true;
   2145 			break;
   2146 		case VARYING_SLOT_PSIZ:
   2147 			so->writes_psize = true;
   2148 			break;
   2149 		case VARYING_SLOT_COL0:
   2150 		case VARYING_SLOT_COL1:
   2151 		case VARYING_SLOT_BFC0:
   2152 		case VARYING_SLOT_BFC1:
   2153 		case VARYING_SLOT_FOGC:
   2154 		case VARYING_SLOT_CLIP_DIST0:
   2155 		case VARYING_SLOT_CLIP_DIST1:
   2156 		case VARYING_SLOT_CLIP_VERTEX:
   2157 			break;
   2158 		default:
   2159 			if (slot >= VARYING_SLOT_VAR0)
   2160 				break;
   2161 			if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
   2162 				break;
   2163 			compile_error(ctx, "unknown VS output name: %s\n",
   2164 					gl_varying_slot_name(slot));
   2165 		}
   2166 	} else {
   2167 		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
   2168 	}
   2169 
   2170 	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
   2171 
   2172 	so->outputs[n].slot = slot;
   2173 	so->outputs[n].regid = regid(n, comp);
   2174 	so->outputs_count = MAX2(so->outputs_count, n + 1);
   2175 
   2176 	for (int i = 0; i < ncomp; i++) {
   2177 		unsigned idx = (n * 4) + i;
   2178 		compile_assert(ctx, idx < ctx->ir->noutputs);
   2179 		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
   2180 	}
   2181 }
   2182 
   2183 static int
   2184 max_drvloc(struct exec_list *vars)
   2185 {
   2186 	int drvloc = -1;
   2187 	nir_foreach_variable(var, vars) {
   2188 		drvloc = MAX2(drvloc, (int)var->data.driver_location);
   2189 	}
   2190 	return drvloc;
   2191 }
   2192 
   2193 static void
   2194 emit_instructions(struct ir3_compile *ctx)
   2195 {
   2196 	unsigned ninputs, noutputs;
   2197 	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);
   2198 
   2199 	ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
   2200 	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
   2201 
    2202 	/* for vtx shaders, we need to leave room for sysvals:
   2203 	 */
   2204 	if (ctx->so->type == SHADER_VERTEX) {
   2205 		ninputs += 16;
   2206 	}
   2207 
   2208 	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
   2209 
   2210 	/* Create inputs in first block: */
   2211 	ctx->block = get_block(ctx, nir_start_block(fxn));
   2212 	ctx->in_block = ctx->block;
   2213 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
   2214 
   2215 	if (ctx->so->type == SHADER_VERTEX) {
   2216 		ctx->ir->ninputs -= 16;
   2217 	}
   2218 
   2219 	/* for fragment shader, we have a single input register (usually
   2220 	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
   2221 	 */
   2222 	if (ctx->so->type == SHADER_FRAGMENT) {
    2223 		// TODO maybe a helper for fi since we need it in a few places..
   2224 		struct ir3_instruction *instr;
   2225 		instr = ir3_instr_create(ctx->block, OPC_META_FI);
   2226 		ir3_reg_create(instr, 0, 0);
   2227 		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
   2228 		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
   2229 		ctx->frag_pos = instr;
   2230 	}
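         	/* (OPC_META_FI is essentially a "fan-in"/collect meta instruction:
         	 * it gathers the two scalar position inputs into the single
         	 * frag_pos value that the bary.f instructions use as their base)
         	 */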
   2231 
   2232 	/* Setup inputs: */
   2233 	nir_foreach_variable(var, &ctx->s->inputs) {
   2234 		setup_input(ctx, var);
   2235 	}
   2236 
   2237 	/* Setup outputs: */
   2238 	nir_foreach_variable(var, &ctx->s->outputs) {
   2239 		setup_output(ctx, var);
   2240 	}
   2241 
   2242 	/* Setup global variables (which should only be arrays): */
   2243 	nir_foreach_variable(var, &ctx->s->globals) {
   2244 		declare_var(ctx, var);
   2245 	}
   2246 
   2247 	/* Setup local variables (which should only be arrays): */
   2248 	/* NOTE: need to do something more clever when we support >1 fxn */
   2249 	nir_foreach_variable(var, &fxn->locals) {
   2250 		declare_var(ctx, var);
   2251 	}
   2252 
   2253 	/* And emit the body: */
   2254 	ctx->impl = fxn;
   2255 	emit_function(ctx, fxn);
   2256 
   2257 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
   2258 		resolve_phis(ctx, block);
   2259 	}
   2260 }
   2261 
   2262 /* from NIR perspective, we actually have inputs.  But most of the "inputs"
   2263  * for a fragment shader are just bary.f instructions.  The *actual* inputs
   2264  * from the hw perspective are the frag_pos and optionally frag_coord and
   2265  * frag_face.
   2266  */
   2267 static void
   2268 fixup_frag_inputs(struct ir3_compile *ctx)
   2269 {
   2270 	struct ir3_shader_variant *so = ctx->so;
   2271 	struct ir3 *ir = ctx->ir;
   2272 	struct ir3_instruction **inputs;
   2273 	struct ir3_instruction *instr;
   2274 	int n, regid = 0;
   2275 
   2276 	ir->ninputs = 0;
   2277 
   2278 	n  = 4;  /* always have frag_pos */
   2279 	n += COND(so->frag_face, 4);
   2280 	n += COND(so->frag_coord, 4);
   2281 
   2282 	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
   2283 
   2284 	if (so->frag_face) {
   2285 		/* this ultimately gets assigned to hr0.x so doesn't conflict
   2286 		 * with frag_coord/frag_pos..
   2287 		 */
   2288 		inputs[ir->ninputs++] = ctx->frag_face;
   2289 		ctx->frag_face->regs[0]->num = 0;
   2290 
   2291 		/* remaining channels not used, but let's avoid confusing
   2292 		 * other parts that expect inputs to come in groups of vec4
   2293 		 */
   2294 		inputs[ir->ninputs++] = NULL;
   2295 		inputs[ir->ninputs++] = NULL;
   2296 		inputs[ir->ninputs++] = NULL;
   2297 	}
   2298 
   2299 	/* since we don't know where to set the regid for frag_coord,
   2300 	 * we have to use r0.x for it.  But we don't want to *always*
   2301 	 * use r1.x for frag_pos as that could increase the register
   2302 	 * footprint on simple shaders:
   2303 	 */
   2304 	if (so->frag_coord) {
   2305 		ctx->frag_coord[0]->regs[0]->num = regid++;
   2306 		ctx->frag_coord[1]->regs[0]->num = regid++;
   2307 		ctx->frag_coord[2]->regs[0]->num = regid++;
   2308 		ctx->frag_coord[3]->regs[0]->num = regid++;
   2309 
   2310 		inputs[ir->ninputs++] = ctx->frag_coord[0];
   2311 		inputs[ir->ninputs++] = ctx->frag_coord[1];
   2312 		inputs[ir->ninputs++] = ctx->frag_coord[2];
   2313 		inputs[ir->ninputs++] = ctx->frag_coord[3];
   2314 	}
   2315 
   2316 	/* we always have frag_pos: */
   2317 	so->pos_regid = regid;
   2318 
   2319 	/* r0.x */
   2320 	instr = create_input(ctx->in_block, ir->ninputs);
   2321 	instr->regs[0]->num = regid++;
   2322 	inputs[ir->ninputs++] = instr;
   2323 	ctx->frag_pos->regs[1]->instr = instr;
   2324 
   2325 	/* r0.y */
   2326 	instr = create_input(ctx->in_block, ir->ninputs);
   2327 	instr->regs[0]->num = regid++;
   2328 	inputs[ir->ninputs++] = instr;
   2329 	ctx->frag_pos->regs[2]->instr = instr;
   2330 
   2331 	ir->inputs = inputs;
   2332 }
   2333 
   2334 /* Fixup tex sampler state for astc/srgb workaround instructions.  We
   2335  * need to assign the tex state indexes for these after we know the
   2336  * max tex index.
   2337  */
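/* For example: if max_texture_index is 3 and samplers #1 and #2 need the
 * workaround, they are assigned the alternate alpha tex state slots 4 and 5
 * respectively.
 */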
   2338 static void
   2339 fixup_astc_srgb(struct ir3_compile *ctx)
   2340 {
   2341 	struct ir3_shader_variant *so = ctx->so;
   2342 	/* indexed by original tex idx, value is newly assigned alpha sampler
   2343 	 * state tex idx.  Zero is invalid since there is at least one sampler
   2344 	 * if we get here.
   2345 	 */
   2346 	unsigned alt_tex_state[16] = {0};
   2347 	unsigned tex_idx = ctx->max_texture_index + 1;
   2348 	unsigned idx = 0;
   2349 
   2350 	so->astc_srgb.base = tex_idx;
   2351 
   2352 	for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
   2353 		struct ir3_instruction *sam = ctx->ir->astc_srgb[i];
   2354 
   2355 		compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));
   2356 
   2357 		if (alt_tex_state[sam->cat5.tex] == 0) {
   2358 			/* assign new alternate/alpha tex state slot: */
   2359 			alt_tex_state[sam->cat5.tex] = tex_idx++;
   2360 			so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
   2361 			so->astc_srgb.count++;
   2362 		}
   2363 
   2364 		sam->cat5.tex = alt_tex_state[sam->cat5.tex];
   2365 	}
   2366 }
   2367 
   2368 int
   2369 ir3_compile_shader_nir(struct ir3_compiler *compiler,
   2370 		struct ir3_shader_variant *so)
   2371 {
   2372 	struct ir3_compile *ctx;
   2373 	struct ir3 *ir;
   2374 	struct ir3_instruction **inputs;
   2375 	unsigned i, j, actual_in, inloc;
   2376 	int ret = 0, max_bary;
   2377 
   2378 	assert(!so->ir);
   2379 
   2380 	ctx = compile_init(compiler, so);
   2381 	if (!ctx) {
   2382 		DBG("INIT failed!");
   2383 		ret = -1;
   2384 		goto out;
   2385 	}
   2386 
   2387 	emit_instructions(ctx);
   2388 
   2389 	if (ctx->error) {
   2390 		DBG("EMIT failed!");
   2391 		ret = -1;
   2392 		goto out;
   2393 	}
   2394 
   2395 	ir = so->ir = ctx->ir;
   2396 
    2397 	/* keep track of the inputs from the NIR perspective.. */
   2398 	inputs = ir->inputs;
   2399 
   2400 	/* but fixup actual inputs for frag shader: */
   2401 	if (so->type == SHADER_FRAGMENT)
   2402 		fixup_frag_inputs(ctx);
   2403 
   2404 	/* at this point, for binning pass, throw away unneeded outputs: */
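         	/* (the binning pass only needs to compute position to build the
         	 * visibility stream, so the remaining outputs can be dropped)
         	 */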
   2405 	if (so->key.binning_pass) {
   2406 		for (i = 0, j = 0; i < so->outputs_count; i++) {
   2407 			unsigned slot = so->outputs[i].slot;
   2408 
   2409 			/* throw away everything but first position/psize */
   2410 			if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
   2411 				if (i != j) {
   2412 					so->outputs[j] = so->outputs[i];
   2413 					ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
   2414 					ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
   2415 					ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
   2416 					ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
   2417 				}
   2418 				j++;
   2419 			}
   2420 		}
   2421 		so->outputs_count = j;
   2422 		ir->noutputs = j * 4;
   2423 	}
   2424 
   2425 	/* if we want half-precision outputs, mark the output registers
   2426 	 * as half:
   2427 	 */
   2428 	if (so->key.half_precision) {
   2429 		for (i = 0; i < ir->noutputs; i++) {
   2430 			struct ir3_instruction *out = ir->outputs[i];
   2431 			if (!out)
   2432 				continue;
   2433 			out->regs[0]->flags |= IR3_REG_HALF;
   2434 			/* output could be a fanout (ie. texture fetch output)
   2435 			 * in which case we need to propagate the half-reg flag
   2436 			 * up to the definer so that RA sees it:
   2437 			 */
   2438 			if (out->opc == OPC_META_FO) {
   2439 				out = out->regs[1]->instr;
   2440 				out->regs[0]->flags |= IR3_REG_HALF;
   2441 			}
   2442 
   2443 			if (out->opc == OPC_MOV) {
   2444 				out->cat1.dst_type = half_type(out->cat1.dst_type);
   2445 			}
   2446 		}
   2447 	}
   2448 
   2449 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
   2450 		printf("BEFORE CP:\n");
   2451 		ir3_print(ir);
   2452 	}
   2453 
   2454 	ir3_cp(ir, so);
   2455 
   2456 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
   2457 		printf("BEFORE GROUPING:\n");
   2458 		ir3_print(ir);
   2459 	}
   2460 
   2461 	/* Group left/right neighbors, inserting mov's where needed to
   2462 	 * solve conflicts:
   2463 	 */
   2464 	ir3_group(ir);
   2465 
   2466 	ir3_depth(ir);
   2467 
   2468 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
   2469 		printf("AFTER DEPTH:\n");
   2470 		ir3_print(ir);
   2471 	}
   2472 
   2473 	ret = ir3_sched(ir);
   2474 	if (ret) {
   2475 		DBG("SCHED failed!");
   2476 		goto out;
   2477 	}
   2478 
   2479 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
   2480 		printf("AFTER SCHED:\n");
   2481 		ir3_print(ir);
   2482 	}
   2483 
   2484 	ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
   2485 	if (ret) {
   2486 		DBG("RA failed!");
   2487 		goto out;
   2488 	}
   2489 
   2490 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
   2491 		printf("AFTER RA:\n");
   2492 		ir3_print(ir);
   2493 	}
   2494 
   2495 	/* fixup input/outputs: */
   2496 	for (i = 0; i < so->outputs_count; i++) {
   2497 		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
   2498 	}
   2499 
   2500 	/* Note that some or all channels of an input may be unused: */
   2501 	actual_in = 0;
   2502 	inloc = 0;
   2503 	for (i = 0; i < so->inputs_count; i++) {
   2504 		unsigned j, regid = ~0, compmask = 0, maxcomp = 0;
   2505 		so->inputs[i].ncomp = 0;
   2506 		so->inputs[i].inloc = inloc;
   2507 		for (j = 0; j < 4; j++) {
   2508 			struct ir3_instruction *in = inputs[(i*4) + j];
   2509 			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
   2510 				compmask |= (1 << j);
   2511 				regid = in->regs[0]->num - j;
   2512 				actual_in++;
   2513 				so->inputs[i].ncomp++;
   2514 				if ((so->type == SHADER_FRAGMENT) && so->inputs[i].bary) {
   2515 					/* assign inloc: */
   2516 					assert(in->regs[1]->flags & IR3_REG_IMMED);
   2517 					in->regs[1]->iim_val = inloc + j;
   2518 					maxcomp = j + 1;
   2519 				}
   2520 			}
   2521 		}
   2522 		if ((so->type == SHADER_FRAGMENT) && compmask && so->inputs[i].bary) {
   2523 			so->varying_in++;
   2524 			so->inputs[i].compmask = (1 << maxcomp) - 1;
   2525 			inloc += maxcomp;
   2526 		} else {
   2527 			so->inputs[i].compmask = compmask;
   2528 		}
   2529 		so->inputs[i].regid = regid;
   2530 	}
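         	/* (eg. for an FS bary varying where only .x and .z are read,
         	 * maxcomp ends up as 3, compmask as 0x7, and three consecutive
         	 * inloc slots are consumed)
         	 */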
   2531 
   2532 	if (ctx->astc_srgb)
   2533 		fixup_astc_srgb(ctx);
   2534 
    2535 	/* We need to do legalize after the "bary.f" offsets (inloc)
    2536 	 * have been assigned (for frag shaders).
   2537 	 */
   2538 	ir3_legalize(ir, &so->has_samp, &max_bary);
   2539 
   2540 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
   2541 		printf("AFTER LEGALIZE:\n");
   2542 		ir3_print(ir);
   2543 	}
   2544 
   2545 	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
   2546 	if (so->type == SHADER_VERTEX)
   2547 		so->total_in = actual_in;
   2548 	else
   2549 		so->total_in = max_bary + 1;
   2550 
   2551 out:
   2552 	if (ret) {
   2553 		if (so->ir)
   2554 			ir3_destroy(so->ir);
   2555 		so->ir = NULL;
   2556 	}
   2557 	compile_free(ctx);
   2558 
   2559 	return ret;
   2560 }
   2561