      1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
      2 
      3 /*
      4  * Copyright (C) 2014 Rob Clark <robclark (at) freedesktop.org>
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     23  * SOFTWARE.
     24  *
     25  * Authors:
     26  *    Rob Clark <robclark (at) freedesktop.org>
     27  */
     28 
     29 #include "util/u_math.h"
     30 #include "util/register_allocate.h"
     31 #include "util/ralloc.h"
     32 #include "util/bitset.h"
     33 
     34 #include "freedreno_util.h"
     35 
     36 #include "ir3.h"
     37 #include "ir3_compiler.h"
     38 
     39 /*
     40  * Register Assignment:
     41  *
     42  * Uses the register_allocate util, which implements a graph coloring
     43  * algorithm with interference classes.  To handle the cases where we need
     44  * consecutive registers (for example, texture sample instructions),
     45  * we model these as larger (double/quad/etc) registers which conflict
     46  * with the corresponding registers in other classes.
     47  *
     48  * We also create additional classes for half-regs, which do not
     49  * conflict with the full-reg classes.  We need at least sizes 1-4
     50  * (to deal w/ texture sample instructions that output to half-regs).
     51  * At the moment we don't create the higher order half-reg classes,
     52  * as half-regs frequently do not have enough precision for texture
     53  * coords at higher resolutions.
     54  *
     55  * There are some additional cases that we need to handle specially,
     56  * as the graph coloring algo doesn't understand "partial writes".
     57  * For example, a sequence like:
     58  *
     59  *   add r0.z, ...
     60  *   sam (f32)(xy)r0.x, ...
     61  *   ...
     62  *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
     63  *
     64  * In this scenario, we treat r0.xyz as class size 3, which is written
     65  * (from a use/def perspective) at the 'add' instruction and ignore the
     66  * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
     67  * defining instruction, as it is the first to partially write r0.xyz.
     68  *
     69  * Note i965 has a similar scenario, which they solve with a virtual
     70  * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
     71  * register assignment.  But for us that is horrible from a scheduling
     72  * standpoint.  Instead what we do is use idea of 'definer' instruction.
     73  * Ie. the first instruction (lowest ip) to write to the variable is the
     74  * one we consider from use/def perspective when building interference
     75  * graph.  (Other instructions which write other variable components
     76  * just define the variable some more.)
     77  *
     78  * Arrays of arbitrary size are handled via pre-coloring a consecutive
     79  * sequence of registers.  Additional scalar (single component) reg
     80  * names are allocated starting at ctx->class_base[total_class_count]
     81  * (see arr->base), which are pre-colored.  In the use/def graph direct
     82  * access is treated as a single element use/def, and indirect access
     83  * is treated as use or def of all array elements.  (Only the first
     84  * def is tracked, in case of multiple indirect writes, etc.)
     85  */
     86 
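        /* To make the "consecutive registers" constraint concrete (an
         * illustrative example based on the tables just below): a texture
         * sample whose coordinate lives in r0.xyz needs a single size-3 name
         * so the three scalars stay adjacent, while a txd sampling a 3d
         * texture needs the 4 + 6 = 10 consecutive regs of the largest class
         * for its collected src values.  Half-reg names come from
         * half_class_sizes[] and live in a separate, non-conflicting part of
         * the virtual register space.
         */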
     87 static const unsigned class_sizes[] = {
     88 	1, 2, 3, 4,
     89 	4 + 4, /* txd + 1d/2d */
     90 	4 + 6, /* txd + 3d */
     91 };
     92 #define class_count ARRAY_SIZE(class_sizes)
     93 
     94 static const unsigned half_class_sizes[] = {
     95 	1, 2, 3, 4,
     96 };
     97 #define half_class_count  ARRAY_SIZE(half_class_sizes)
     98 #define total_class_count (class_count + half_class_count)
     99 
    100 /* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
    101 #define NUM_REGS             (4 * 48)
    102 /* Number of virtual regs in a given class: */
    103 #define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
    104 #define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
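        /* Worked example with the values above: NUM_REGS is 4 * 48 = 192
         * scalar gprs, so a size-4 class has CLASS_REGS = 192 - 3 = 189
         * possible starting positions (a size-4 vreg can only start at
         * scalar positions 0..188).  Summing CLASS_REGS over the six full
         * classes and HALF_CLASS_REGS over the four half classes gives the
         * total number of "virtual" registers handed to the RA util
         * (1892 with the sizes above).
         */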
    105 
    106 /* register-set, created one time, used for all shaders: */
    107 struct ir3_ra_reg_set {
    108 	struct ra_regs *regs;
    109 	unsigned int classes[class_count];
    110 	unsigned int half_classes[half_class_count];
    111 	/* maps flat virtual register space to base gpr: */
    112 	uint16_t *ra_reg_to_gpr;
    113 	/* maps cls,gpr to flat virtual register space: */
    114 	uint16_t **gpr_to_ra_reg;
    115 };
    116 
    117 /* One-time setup of RA register-set, which describes all the possible
    118  * "virtual" registers and their interferences.  Ie. double register
    119  * occupies (and conflicts with) two single registers, and so forth.
    120  * Since registers do not need to be aligned to their class size, they
    121  * can conflict with other registers in the same class too.  Ie:
    122  *
    123  *    Single (base) |  Double
    124  *    --------------+---------------
    125  *       R0         |  D0
    126  *       R1         |  D0 D1
    127  *       R2         |     D1 D2
    128  *       R3         |        D2
    129  *           .. and so on..
    130  *
    131  * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
    132  * really just four scalar registers.  Don't let that confuse you.)
    133  */
    134 struct ir3_ra_reg_set *
    135 ir3_ra_alloc_reg_set(void *memctx)
    136 {
    137 	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
    138 	unsigned ra_reg_count, reg, first_half_reg;
    139 	unsigned int **q_values;
    140 
    141 	/* calculate # of regs across all classes: */
    142 	ra_reg_count = 0;
    143 	for (unsigned i = 0; i < class_count; i++)
    144 		ra_reg_count += CLASS_REGS(i);
    145 	for (unsigned i = 0; i < half_class_count; i++)
    146 		ra_reg_count += HALF_CLASS_REGS(i);
    147 
    148 	/* allocate and populate q_values: */
    149 	q_values = ralloc_array(set, unsigned *, total_class_count);
    150 	for (unsigned i = 0; i < class_count; i++) {
    151 		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
    152 
    153 		/* From register_allocate.c:
    154 		 *
    155 		 * q(B,C) (indexed by C, B is this register class) in
    156 		 * Runeson/Nyström paper.  This is "how many registers of B could
    157 		 * the worst choice register from C conflict with".
    158 		 *
    159 		 * If we just let the register allocation algorithm compute these
    160 		 * values, it is extremely expensive.  However, since all of our
    161 		 * registers are laid out, we can very easily compute them
    162 		 * ourselves.  View the register from C as fixed starting at GRF n
    163 		 * somewhere in the middle, and the register from B as sliding back
    164 		 * and forth.  Then the first register to conflict from B is the
    165 		 * one starting at n - class_size[B] + 1 and the last register to
    166 		 * conflict will start at n + class_size[B] - 1.  Therefore, the
    167 		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
    168 		 *
    169 		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
    170 		 * B | | | | | |n| --> | | | | | | |
    171 		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
    172 		 *             +-+-+-+-+-+
    173 		 * C           |n| | | | |
    174 		 *             +-+-+-+-+-+
    175 		 *
    176 		 * (Idea copied from brw_fs_reg_allocate.cpp)
    177 		 */
    178 		for (unsigned j = 0; j < class_count; j++)
    179 			q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
    180 	}
    181 
    182 	for (unsigned i = class_count; i < total_class_count; i++) {
    183 		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
    184 
    185 		/* see comment above: */
    186 		for (unsigned j = class_count; j < total_class_count; j++) {
    187 			q_values[i][j] = half_class_sizes[i - class_count] +
    188 					half_class_sizes[j - class_count] - 1;
    189 		}
    190 	}
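        	/* Worked example of the formula above: for B = the size-2 class
        	 * and C = the size-4 class, a size-4 register fixed at scalar
        	 * position n overlaps the size-2 registers starting at n-1 .. n+3,
        	 * so q(B,C) = 2 + 4 - 1 = 5.  The half-reg classes get analogous
        	 * values, computed separately since they never conflict with the
        	 * full-reg classes.
        	 */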
    191 
    192 	/* allocate the reg-set.. */
    193 	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
    194 	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
    195 	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
    196 
    197 	/* .. and classes */
    198 	reg = 0;
    199 	for (unsigned i = 0; i < class_count; i++) {
    200 		set->classes[i] = ra_alloc_reg_class(set->regs);
    201 
    202 		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
    203 
    204 		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
    205 			ra_class_add_reg(set->regs, set->classes[i], reg);
    206 
    207 			set->ra_reg_to_gpr[reg] = j;
    208 			set->gpr_to_ra_reg[i][j] = reg;
    209 
    210 			for (unsigned br = j; br < j + class_sizes[i]; br++)
    211 				ra_add_transitive_reg_conflict(set->regs, br, reg);
    212 
    213 			reg++;
    214 		}
    215 	}
    216 
    217 	first_half_reg = reg;
    218 
    219 	for (unsigned i = 0; i < half_class_count; i++) {
    220 		set->half_classes[i] = ra_alloc_reg_class(set->regs);
    221 
    222 		set->gpr_to_ra_reg[class_count + i] =
    223 				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
    224 
    225 		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
    226 			ra_class_add_reg(set->regs, set->half_classes[i], reg);
    227 
    228 			set->ra_reg_to_gpr[reg] = j;
    229 			set->gpr_to_ra_reg[class_count + i][j] = reg;
    230 
    231 			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
    232 				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
    233 
    234 			reg++;
    235 		}
    236 	}
    237 
    238 	ra_set_finalize(set->regs, q_values);
    239 
    240 	ralloc_free(q_values);
    241 
    242 	return set;
    243 }
    244 
    245 /* additional block-data (per-block) */
    246 struct ir3_ra_block_data {
    247 	BITSET_WORD *def;        /* variables defined before used in block */
    248 	BITSET_WORD *use;        /* variables used before defined in block */
    249 	BITSET_WORD *livein;     /* which defs reach entry point of block */
    250 	BITSET_WORD *liveout;    /* which defs reach exit point of block */
    251 };
    252 
    253 /* additional instruction-data (per-instruction) */
    254 struct ir3_ra_instr_data {
    255 	/* cached instruction 'definer' info: */
    256 	struct ir3_instruction *defn;
    257 	int off, sz, cls;
    258 };
    259 
    260 /* register-assign context, per-shader */
    261 struct ir3_ra_ctx {
    262 	struct ir3 *ir;
    263 	enum shader_t type;
    264 	bool frag_face;
    265 
    266 	struct ir3_ra_reg_set *set;
    267 	struct ra_graph *g;
    268 	unsigned alloc_count;
    269 	/* one per class, plus one slot for arrays: */
    270 	unsigned class_alloc_count[total_class_count + 1];
    271 	unsigned class_base[total_class_count + 1];
    272 	unsigned instr_cnt;
    273 	unsigned *def, *use;     /* def/use table */
    274 	struct ir3_ra_instr_data *instrd;
    275 };
    276 
    277 /* does [a_start, a_end) overlap [b_start, b_end)?  (touching endpoints don't conflict) */
    278 static inline bool
    279 intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
    280 {
    281 	return !((a_start >= b_end) || (b_start >= a_end));
    282 }
    283 
    284 static bool
    285 is_half(struct ir3_instruction *instr)
    286 {
    287 	return !!(instr->regs[0]->flags & IR3_REG_HALF);
    288 }
    289 
    290 static int
    291 size_to_class(unsigned sz, bool half)
    292 {
    293 	if (half) {
    294 		for (unsigned i = 0; i < half_class_count; i++)
    295 			if (half_class_sizes[i] >= sz)
    296 				return i + class_count;
    297 	} else {
    298 		for (unsigned i = 0; i < class_count; i++)
    299 			if (class_sizes[i] >= sz)
    300 				return i;
    301 	}
    302 	debug_assert(0);
    303 	return -1;
    304 }
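        /* For example, with the tables above size_to_class(3, false) returns 2
         * (the size-3 full-reg class), and size_to_class(5, false) returns 4,
         * since the smallest full class that can hold 5 consecutive regs is
         * the 4 + 4 = 8 one.  Half-reg lookups return an index offset by
         * class_count, so both kinds share one flat class numbering.
         */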
    305 
    306 static bool
    307 is_temp(struct ir3_register *reg)
    308 {
    309 	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
    310 		return false;
    311 	if ((reg->num == regid(REG_A0, 0)) ||
    312 			(reg->num == regid(REG_P0, 0)))
    313 		return false;
    314 	return true;
    315 }
    316 
    317 static bool
    318 writes_gpr(struct ir3_instruction *instr)
    319 {
    320 	if (is_store(instr))
    321 		return false;
    322 	/* is dest a normal temp register: */
    323 	return is_temp(instr->regs[0]);
    324 }
    325 
    326 static bool
    327 instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
    328 {
    329 	if (a->flags & IR3_INSTR_UNUSED)
    330 		return false;
    331 	return (a->ip < b->ip);
    332 }
    333 
    334 static struct ir3_instruction *
    335 get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
    336 		int *sz, int *off)
    337 {
    338 	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    339 	struct ir3_instruction *d = NULL;
    340 
    341 	if (id->defn) {
    342 		*sz = id->sz;
    343 		*off = id->off;
    344 		return id->defn;
    345 	}
    346 
    347 	if (instr->opc == OPC_META_FI) {
    348 		/* TODO: handle the case where the collect is a subset of an array;
    349 		 * we'd need to find the distance between where the actual array
    350 		 * starts and the fanin.  That probably doesn't happen currently.
    351 		 */
    352 		struct ir3_register *src;
    353 		int dsz, doff;
    354 
    355 		/* note: don't use foreach_ssa_src as this gets called once
    356 		 * while assigning regs (which clears SSA flag)
    357 		 */
    358 		foreach_src_n(src, n, instr) {
    359 			struct ir3_instruction *dd;
    360 			if (!src->instr)
    361 				continue;
    362 
    363 			dd = get_definer(ctx, src->instr, &dsz, &doff);
    364 
    365 			if ((!d) || instr_before(dd, d)) {
    366 				d = dd;
    367 				*sz = dsz;
    368 				*off = doff - n;
    369 			}
    370 		}
    371 
    372 	} else if (instr->cp.right || instr->cp.left) {
    373 		/* covers also the meta:fo case, which ends up w/ single
    374 		 * scalar instructions for each component:
    375 		 */
    376 		struct ir3_instruction *f = ir3_neighbor_first(instr);
    377 
    378 		/* by definition, the entire sequence forms one linked list
    379 		 * of single scalar register nodes (even if some of them may
    380 		 * be fanouts from, for example, a texture sample instr).  We
    381 		 * just need to walk the list to find the first element of
    382 		 * the group to be defined (lowest ip).
    383 		 */
    384 		int cnt = 0;
    385 
    386 		/* need to skip over unused in the group: */
    387 		while (f && (f->flags & IR3_INSTR_UNUSED)) {
    388 			f = f->cp.right;
    389 			cnt++;
    390 		}
    391 
    392 		while (f) {
    393 			if ((!d) || instr_before(f, d))
    394 				d = f;
    395 			if (f == instr)
    396 				*off = cnt;
    397 			f = f->cp.right;
    398 			cnt++;
    399 		}
    400 
    401 		*sz = cnt;
    402 
    403 	} else {
    404 		/* this case looks directly at the instruction which
    405 		 * produces multiple values (eg, texture sample), rather
    406 		 * than the fanout nodes that point back to that instruction.
    407 		 * This isn't quite right, because it may be part of a larger
    408 		 * group, such as:
    409 		 *
    410 		 *     sam (f32)(xyzw)r0.x, ...
    411 		 *     add r1.x, ...
    412 		 *     add r1.y, ...
    413 		 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
    414 		 *
    415 		 * need to come up with a better way to handle that case.
    416 		 */
    417 		if (instr->address) {
    418 			*sz = instr->regs[0]->size;
    419 		} else {
    420 			*sz = util_last_bit(instr->regs[0]->wrmask);
    421 		}
    422 		*off = 0;
    423 		d = instr;
    424 	}
    425 
    426 	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
    427 		struct ir3_instruction *phi = d->regs[0]->instr;
    428 		struct ir3_instruction *dd;
    429 		int dsz, doff;
    430 
    431 		dd = get_definer(ctx, phi, &dsz, &doff);
    432 
    433 		*sz = MAX2(*sz, dsz);
    434 		*off = doff;
    435 
    436 		if (instr_before(dd, d)) {
    437 			d = dd;
    438 		}
    439 	}
    440 
    441 	if (d->opc == OPC_META_PHI) {
    442 		/* we have already inserted parallel-copies into
    443 		 * the phi, so we don't need to chase definers
    444 		 */
    445 		struct ir3_register *src;
    446 		struct ir3_instruction *dd = d;
    447 
    448 		/* note: don't use foreach_ssa_src as this gets called once
    449 		 * while assigning regs (which clears SSA flag)
    450 		 */
    451 		foreach_src(src, d) {
    452 			if (!src->instr)
    453 				continue;
    454 			if (instr_before(src->instr, dd))
    455 				dd = src->instr;
    456 		}
    457 
    458 		d = dd;
    459 	}
    460 
    461 	if (d->opc == OPC_META_FO) {
    462 		struct ir3_instruction *dd;
    463 		int dsz, doff;
    464 
    465 		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
    466 
    467 		/* by definition, should come before: */
    468 		debug_assert(instr_before(dd, d));
    469 
    470 		*sz = MAX2(*sz, dsz);
    471 
    472 		debug_assert(instr->opc == OPC_META_FO);
    473 		*off = MAX2(*off, instr->fo.off);
    474 
    475 		d = dd;
    476 	}
    477 
    478 	id->defn = d;
    479 	id->sz = *sz;
    480 	id->off = *off;
    481 
    482 	return d;
    483 }
    484 
    485 static void
    486 ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
    487 {
    488 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
    489 		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    490 		if (instr->regs_count == 0)
    491 			continue;
    492 		/* couple special cases: */
    493 		if (writes_addr(instr) || writes_pred(instr)) {
    494 			id->cls = -1;
    495 		} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
    496 			id->cls = total_class_count;
    497 			id->defn = instr;
    498 		} else {
    499 			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
    500 			id->cls = size_to_class(id->sz, is_half(id->defn));
    501 		}
    502 	}
    503 }
    504 
    505 /* give each instruction a name (and ip), and count up the # of names
    506  * of each class
    507  */
    508 static void
    509 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
    510 {
    511 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
    512 		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    513 
    514 #ifdef DEBUG
    515 		instr->name = ~0;
    516 #endif
    517 
    518 		ctx->instr_cnt++;
    519 
    520 		if (instr->regs_count == 0)
    521 			continue;
    522 
    523 		if (!writes_gpr(instr))
    524 			continue;
    525 
    526 		if (id->defn != instr)
    527 			continue;
    528 
    529 		/* arrays which don't fit in one of the pre-defined class
    530 		 * sizes are pre-colored:
    531 		 */
    532 		if (id->cls >= 0) {
    533 			instr->name = ctx->class_alloc_count[id->cls]++;
    534 			ctx->alloc_count++;
    535 		}
    536 	}
    537 }
    538 
    539 static void
    540 ra_init(struct ir3_ra_ctx *ctx)
    541 {
    542 	unsigned n, base;
    543 
    544 	ir3_clear_mark(ctx->ir);
    545 	n = ir3_count_instructions(ctx->ir);
    546 
    547 	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
    548 
    549 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
    550 		ra_block_find_definers(ctx, block);
    551 	}
    552 
    553 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
    554 		ra_block_name_instructions(ctx, block);
    555 	}
    556 
    557 	/* figure out the base register name for each class.  The
    558 	 * actual ra name is class_base[cls] + instr->name;
    559 	 */
    560 	ctx->class_base[0] = 0;
    561 	for (unsigned i = 1; i <= total_class_count; i++) {
    562 		ctx->class_base[i] = ctx->class_base[i-1] +
    563 				ctx->class_alloc_count[i-1];
    564 	}
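        	/* For example (hypothetical counts): if class 0 ended up with 12
        	 * names and class 1 with 3, class_base[] starts {0, 12, 15, ...},
        	 * so a class-1 value whose instr->name is 2 gets ra name
        	 * 12 + 2 = 14.
        	 */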
    565 
    566 	/* and vreg names for array elements: */
    567 	base = ctx->class_base[total_class_count];
    568 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
    569 		arr->base = base;
    570 		ctx->class_alloc_count[total_class_count] += arr->length;
    571 		base += arr->length;
    572 	}
    573 	ctx->alloc_count += ctx->class_alloc_count[total_class_count];
    574 
    575 	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
    576 	ralloc_steal(ctx->g, ctx->instrd);
    577 	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
    578 	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
    579 }
    580 
    581 static unsigned
    582 __ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
    583 {
    584 	unsigned name;
    585 	debug_assert(cls >= 0);
    586 	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
    587 	name = ctx->class_base[cls] + defn->name;
    588 	debug_assert(name < ctx->alloc_count);
    589 	return name;
    590 }
    591 
    592 static int
    593 ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
    594 {
    595 	/* TODO handle name mapping for arrays */
    596 	return __ra_name(ctx, id->cls, id->defn);
    597 }
    598 
    599 static void
    600 ra_destroy(struct ir3_ra_ctx *ctx)
    601 {
    602 	ralloc_free(ctx->g);
    603 }
    604 
    605 static void
    606 ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
    607 {
    608 	struct ir3_ra_block_data *bd;
    609 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
    610 
    611 #define def(name, instr) \
    612 		do { \
    613 			/* defined on first write: */ \
    614 			if (!ctx->def[name]) \
    615 				ctx->def[name] = instr->ip; \
    616 			ctx->use[name] = instr->ip; \
    617 			BITSET_SET(bd->def, name); \
    618 		} while(0);
    619 
    620 #define use(name, instr) \
    621 		do { \
    622 			ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
    623 			if (!BITSET_TEST(bd->def, name)) \
    624 				BITSET_SET(bd->use, name); \
    625 		} while(0);
    626 
    627 	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
    628 
    629 	bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
    630 	bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
    631 	bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
    632 	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
    633 
    634 	block->data = bd;
    635 
    636 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
    637 		struct ir3_instruction *src;
    638 		struct ir3_register *reg;
    639 
    640 		if (instr->regs_count == 0)
    641 			continue;
    642 
    643 		/* There are a couple special cases to deal with here:
    644 		 *
    645 		 * fanout: used to split values from a higher class to a lower
    646 		 *     class, for example split the results of a texture fetch
    647 		 *     into individual scalar values;  We skip over these from
    648 		 *     a 'def' perspective, and for a 'use' we walk the chain
    649 		 *     up to the defining instruction.
    650 		 *
    651 		 * fanin: used to collect values from lower class and assemble
    652 		 *     them together into a higher class, for example arguments
    653 		 *     to texture sample instructions;  We consider these to be
    654 		 *     defined at the earliest fanin source.
    655 		 *
    656 		 * phi: used to merge values from different flow control paths
    657 		 *     to the same reg.  Consider defined at earliest phi src,
    658 		 *     and update all the other phi src's (which may come later
    659 		 *     in the program) as users to extend the var's live range.
    660 		 *
    661 		 * Most of this, other than phi, is completely handled in the
    662 		 * get_definer() helper.
    663 		 *
    664 		 * In all cases, we trace the instruction back to the original
    665 		 * definer and consider that as the def/use ip.
    666 		 */
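        		/* For example, a fanin collecting two mov results written at
        		 * ip 10 and ip 12 is treated as a single multi-component value
        		 * defined at ip 10 (the earliest definer), so any later use of
        		 * it keeps the whole thing live from there.  (Hypothetical ips,
        		 * just to illustrate the rule described above.)
        		 */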
    667 
    668 		if (writes_gpr(instr)) {
    669 			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    670 			struct ir3_register *dst = instr->regs[0];
    671 
    672 			if (dst->flags & IR3_REG_ARRAY) {
    673 				struct ir3_array *arr =
    674 					ir3_lookup_array(ctx->ir, dst->array.id);
    675 				unsigned i;
    676 
    677 				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
    678 
    679 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
    680 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
    681 
    682 				/* set the node class now.. in case we don't encounter
    683 				 * this array dst again.  From register_alloc algo's
    684 				 * perspective, these are all single/scalar regs:
    685 				 */
    686 				for (i = 0; i < arr->length; i++) {
    687 					unsigned name = arr->base + i;
    688 					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
    689 				}
    690 
    691 				/* indirect write is treated like a write to all array
    692 				 * elements, since we don't know which one is actually
    693 				 * written:
    694 				 */
    695 				if (dst->flags & IR3_REG_RELATIV) {
    696 					for (i = 0; i < arr->length; i++) {
    697 						unsigned name = arr->base + i;
    698 						def(name, instr);
    699 					}
    700 				} else {
    701 					unsigned name = arr->base + dst->array.offset;
    702 					def(name, instr);
    703 				}
    704 
    705 			} else if (id->defn == instr) {
    706 				unsigned name = ra_name(ctx, id);
    707 
    708 				/* since we are in SSA at this point: */
    709 				debug_assert(!BITSET_TEST(bd->use, name));
    710 
    711 				def(name, id->defn);
    712 
    713 				if (is_half(id->defn)) {
    714 					ra_set_node_class(ctx->g, name,
    715 							ctx->set->half_classes[id->cls - class_count]);
    716 				} else {
    717 					ra_set_node_class(ctx->g, name,
    718 							ctx->set->classes[id->cls]);
    719 				}
    720 
    721 				/* extend the live range for phi srcs, which may come
    722 				 * from the bottom of the loop
    723 				 */
    724 				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
    725 					struct ir3_instruction *phi = id->defn->regs[0]->instr;
    726 					foreach_ssa_src(src, phi) {
    727 						/* if src is after phi, then we need to extend
    728 						 * the liverange to the end of src's block:
    729 						 */
    730 						if (src->ip > phi->ip) {
    731 							struct ir3_instruction *last =
    732 									list_last_entry(&src->block->instr_list,
    733 											struct ir3_instruction, node);
    734 							ctx->use[name] = MAX2(ctx->use[name], last->ip);
    735 						}
    736 					}
    737 				}
    738 			}
    739 		}
    740 
    741 		foreach_src(reg, instr) {
    742 			if (reg->flags & IR3_REG_ARRAY) {
    743 				struct ir3_array *arr =
    744 					ir3_lookup_array(ctx->ir, reg->array.id);
    745 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
    746 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
    747 				/* indirect read is treated like a read from all array
    748 				 * elements, since we don't know which one is actually
    749 				 * read:
    750 				 */
    751 				if (reg->flags & IR3_REG_RELATIV) {
    752 					unsigned i;
    753 					for (i = 0; i < arr->length; i++) {
    754 						unsigned name = arr->base + i;
    755 						use(name, instr);
    756 					}
    757 				} else {
    758 					unsigned name = arr->base + reg->array.offset;
    759 					use(name, instr);
    760 					debug_assert(reg->array.offset < arr->length);
    761 				}
    762 			} else if ((src = ssa(reg)) && writes_gpr(src)) {
    763 				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
    764 				use(name, instr);
    765 			}
    766 		}
    767 	}
    768 }
    769 
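        /* One pass of the standard backwards liveness dataflow:
         *
         *    livein(b)  = use(b) | (liveout(b) & ~def(b))
         *    liveout(b) = union of livein(s) over each successor s of b
         *
         * Returns true if any bit changed; the caller iterates this to a
         * fixed point.
         */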
    770 static bool
    771 ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
    772 {
    773 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
    774 	bool progress = false;
    775 
    776 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
    777 		struct ir3_ra_block_data *bd = block->data;
    778 
    779 		/* update livein: */
    780 		for (unsigned i = 0; i < bitset_words; i++) {
    781 			BITSET_WORD new_livein =
    782 				(bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
    783 
    784 			if (new_livein & ~bd->livein[i]) {
    785 				bd->livein[i] |= new_livein;
    786 				progress = true;
    787 			}
    788 		}
    789 
    790 		/* update liveout: */
    791 		for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
    792 			struct ir3_block *succ = block->successors[j];
    793 			struct ir3_ra_block_data *succ_bd;
    794 
    795 			if (!succ)
    796 				continue;
    797 
    798 			succ_bd = succ->data;
    799 
    800 			for (unsigned i = 0; i < bitset_words; i++) {
    801 				BITSET_WORD new_liveout =
    802 					(succ_bd->livein[i] & ~bd->liveout[i]);
    803 
    804 				if (new_liveout) {
    805 					bd->liveout[i] |= new_liveout;
    806 					progress = true;
    807 				}
    808 			}
    809 		}
    810 	}
    811 
    812 	return progress;
    813 }
    814 
    815 static void
    816 print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
    817 {
    818 	bool first = true;
    819 	debug_printf("  %s:", name);
    820 	for (unsigned i = 0; i < cnt; i++) {
    821 		if (BITSET_TEST(bs, i)) {
    822 			if (!first)
    823 				debug_printf(",");
    824 			debug_printf(" %04u", i);
    825 			first = false;
    826 		}
    827 	}
    828 	debug_printf("\n");
    829 }
    830 
    831 static void
    832 ra_add_interference(struct ir3_ra_ctx *ctx)
    833 {
    834 	struct ir3 *ir = ctx->ir;
    835 
    836 	/* initialize array live ranges: */
    837 	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
    838 		arr->start_ip = ~0;
    839 		arr->end_ip = 0;
    840 	}
    841 
    842 	/* compute live ranges (use/def) on a block level, also updating
    843 	 * block's def/use bitmasks (used below to calculate per-block
    844 	 * livein/liveout):
    845 	 */
    846 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
    847 		ra_block_compute_live_ranges(ctx, block);
    848 	}
    849 
    850 	/* update per-block livein/liveout: */
    851 	while (ra_compute_livein_liveout(ctx)) {}
    852 
    853 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
    854 		debug_printf("AFTER LIVEIN/OUT:\n");
    855 		ir3_print(ir);
    856 		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
    857 			struct ir3_ra_block_data *bd = block->data;
    858 			debug_printf("block%u:\n", block_id(block));
    859 			print_bitset("def", bd->def, ctx->alloc_count);
    860 			print_bitset("use", bd->use, ctx->alloc_count);
    861 			print_bitset("l/i", bd->livein, ctx->alloc_count);
    862 			print_bitset("l/o", bd->liveout, ctx->alloc_count);
    863 		}
    864 	}
    865 
    866 	/* extend start/end ranges based on livein/liveout info from cfg: */
    867 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
    868 		struct ir3_ra_block_data *bd = block->data;
    869 
    870 		for (unsigned i = 0; i < ctx->alloc_count; i++) {
    871 			if (BITSET_TEST(bd->livein, i)) {
    872 				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
    873 				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
    874 			}
    875 
    876 			if (BITSET_TEST(bd->liveout, i)) {
    877 				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
    878 				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
    879 			}
    880 		}
    881 	}
    882 
    883 	/* need to fix things up to keep outputs live: */
    884 	for (unsigned i = 0; i < ir->noutputs; i++) {
    885 		struct ir3_instruction *instr = ir->outputs[i];
    886 		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
    887 		ctx->use[name] = ctx->instr_cnt;
    888 	}
    889 
    890 	for (unsigned i = 0; i < ctx->alloc_count; i++) {
    891 		for (unsigned j = 0; j < ctx->alloc_count; j++) {
    892 			if (intersects(ctx->def[i], ctx->use[i],
    893 					ctx->def[j], ctx->use[j])) {
    894 				ra_add_node_interference(ctx->g, i, j);
    895 			}
    896 		}
    897 	}
    898 }
    899 
    900 /* some instructions need fix-up if dst register is half precision: */
    901 static void fixup_half_instr_dst(struct ir3_instruction *instr)
    902 {
    903 	switch (opc_cat(instr->opc)) {
    904 	case 1: /* move instructions */
    905 		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
    906 		break;
    907 	case 3:
    908 		switch (instr->opc) {
    909 		case OPC_MAD_F32:
    910 			instr->opc = OPC_MAD_F16;
    911 			break;
    912 		case OPC_SEL_B32:
    913 			instr->opc = OPC_SEL_B16;
    914 			break;
    915 		case OPC_SEL_S32:
    916 			instr->opc = OPC_SEL_S16;
    917 			break;
    918 		case OPC_SEL_F32:
    919 			instr->opc = OPC_SEL_F16;
    920 			break;
    921 		case OPC_SAD_S32:
    922 			instr->opc = OPC_SAD_S16;
    923 			break;
    924 		/* instructions may already be fixed up: */
    925 		case OPC_MAD_F16:
    926 		case OPC_SEL_B16:
    927 		case OPC_SEL_S16:
    928 		case OPC_SEL_F16:
    929 		case OPC_SAD_S16:
    930 			break;
    931 		default:
    932 			assert(0);
    933 			break;
    934 		}
    935 		break;
    936 	case 5:
    937 		instr->cat5.type = half_type(instr->cat5.type);
    938 		break;
    939 	}
    940 }
    941 /* some instructions need fix-up if src register is half precision: */
    942 static void fixup_half_instr_src(struct ir3_instruction *instr)
    943 {
    944 	switch (instr->opc) {
    945 	case OPC_MOV:
    946 		instr->cat1.src_type = half_type(instr->cat1.src_type);
    947 		break;
    948 	default:
    949 		break;
    950 	}
    951 }
    952 
    953 /* NOTE: instr could be NULL for the IR3_REG_ARRAY case, i.e. for the
    954  * first array access(es), which do not have any previous access to
    955  * depend on from a scheduling point of view.
    956  */
    957 static void
    958 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
    959 		struct ir3_instruction *instr)
    960 {
    961 	struct ir3_ra_instr_data *id;
    962 
    963 	if (reg->flags & IR3_REG_ARRAY) {
    964 		struct ir3_array *arr =
    965 			ir3_lookup_array(ctx->ir, reg->array.id);
    966 		unsigned name = arr->base + reg->array.offset;
    967 		unsigned r = ra_get_node_reg(ctx->g, name);
    968 		unsigned num = ctx->set->ra_reg_to_gpr[r];
    969 
    970 		if (reg->flags & IR3_REG_RELATIV) {
    971 			reg->array.offset = num;
    972 		} else {
    973 			reg->num = num;
    974 		}
    975 
    976 		reg->flags &= ~IR3_REG_ARRAY;
    977 	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
    978 		unsigned name = ra_name(ctx, id);
    979 		unsigned r = ra_get_node_reg(ctx->g, name);
    980 		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
    981 
    982 		debug_assert(!(reg->flags & IR3_REG_RELATIV));
    983 
    984 		reg->num = num;
    985 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
    986 
    987 		if (is_half(id->defn))
    988 			reg->flags |= IR3_REG_HALF;
    989 	}
    990 }
    991 
    992 static void
    993 ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
    994 {
    995 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
    996 		struct ir3_register *reg;
    997 
    998 		if (instr->regs_count == 0)
    999 			continue;
   1000 
   1001 		if (writes_gpr(instr)) {
   1002 			reg_assign(ctx, instr->regs[0], instr);
   1003 			if (instr->regs[0]->flags & IR3_REG_HALF)
   1004 				fixup_half_instr_dst(instr);
   1005 		}
   1006 
   1007 		foreach_src_n(reg, n, instr) {
   1008 			struct ir3_instruction *src = reg->instr;
   1009 			/* Note: reg->instr could be null for IR3_REG_ARRAY */
   1010 			if (!(src || (reg->flags & IR3_REG_ARRAY)))
   1011 				continue;
   1012 			reg_assign(ctx, instr->regs[n+1], src);
   1013 			if (instr->regs[n+1]->flags & IR3_REG_HALF)
   1014 				fixup_half_instr_src(instr);
   1015 		}
   1016 	}
   1017 }
   1018 
   1019 static int
   1020 ra_alloc(struct ir3_ra_ctx *ctx)
   1021 {
   1022 	unsigned n = 0;
   1023 
   1024 	/* frag shader inputs get pre-assigned, since we have some
   1025 	 * constraints/unknowns about setup for some of these regs:
   1026 	 */
   1027 	if (ctx->type == SHADER_FRAGMENT) {
   1028 		struct ir3 *ir = ctx->ir;
   1029 		unsigned i = 0, j;
   1030 		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
   1031 			struct ir3_instruction *instr = ir->inputs[i];
   1032 			int cls = size_to_class(1, true);
   1033 			unsigned name = __ra_name(ctx, cls, instr);
   1034 			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
   1035 
   1036 			/* if we have frag_face, it gets hr0.x */
   1037 			ra_set_node_reg(ctx->g, name, reg);
   1038 			i += 4;
   1039 		}
   1040 
   1041 		j = 0;
   1042 		for (; i < ir->ninputs; i++) {
   1043 			struct ir3_instruction *instr = ir->inputs[i];
   1044 			if (instr) {
   1045 				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
   1046 
   1047 				if (id->defn == instr) {
   1048 					unsigned name, reg;
   1049 
   1050 					name = ra_name(ctx, id);
   1051 					reg = ctx->set->gpr_to_ra_reg[id->cls][j];
   1052 
   1053 					ra_set_node_reg(ctx->g, name, reg);
   1054 					j += id->sz;
   1055 				}
   1056 			}
   1057 		}
   1058 		n = j;
   1059 	}
   1060 
   1061 	/* pre-assign array elements:
   1062 	 */
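        	/* For example, if a previously placed arr2 with reg 4 and length 4
        	 * has a live range overlapping this array's, the retry loop below
        	 * bumps base to at least 8, so arrays that are live at the same
        	 * time never share scalar regs.  (Hypothetical numbers, just to
        	 * illustrate the conflict check.)
        	 */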
   1063 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
   1064 		unsigned base = n;
   1065 
   1066 		if (arr->end_ip == 0)
   1067 			continue;
   1068 
   1069 		/* figure out what else we conflict with which has already
   1070 		 * been assigned:
   1071 		 */
   1072 retry:
   1073 		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
   1074 			if (arr2 == arr)
   1075 				break;
   1076 			if (arr2->end_ip == 0)
   1077 				continue;
   1078 			/* if it intersects with liverange AND register range.. */
   1079 			if (intersects(arr->start_ip, arr->end_ip,
   1080 					arr2->start_ip, arr2->end_ip) &&
   1081 				intersects(base, base + arr->length,
   1082 					arr2->reg, arr2->reg + arr2->length)) {
   1083 				base = MAX2(base, arr2->reg + arr2->length);
   1084 				goto retry;
   1085 			}
   1086 		}
   1087 
   1088 		arr->reg = base;
   1089 
   1090 		for (unsigned i = 0; i < arr->length; i++) {
   1091 			unsigned name, reg;
   1092 
   1093 			name = arr->base + i;
   1094 			reg = ctx->set->gpr_to_ra_reg[0][base++];
   1095 
   1096 			ra_set_node_reg(ctx->g, name, reg);
   1097 		}
   1098 	}
   1099 
   1100 	if (!ra_allocate(ctx->g))
   1101 		return -1;
   1102 
   1103 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
   1104 		ra_block_alloc(ctx, block);
   1105 	}
   1106 
   1107 	return 0;
   1108 }
   1109 
   1110 int ir3_ra(struct ir3 *ir, enum shader_t type,
   1111 		bool frag_coord, bool frag_face)
   1112 {
   1113 	struct ir3_ra_ctx ctx = {
   1114 			.ir = ir,
   1115 			.type = type,
   1116 			.frag_face = frag_face,
   1117 			.set = ir->compiler->set,
   1118 	};
   1119 	int ret;
   1120 
   1121 	ra_init(&ctx);
   1122 	ra_add_interference(&ctx);
   1123 	ret = ra_alloc(&ctx);
   1124 	ra_destroy(&ctx);
   1125 
   1126 	return ret;
   1127 }
   1128