      1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
      2 
      3 /*
      4  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     23  * SOFTWARE.
     24  *
     25  * Authors:
     26  *    Rob Clark <robclark@freedesktop.org>
     27  */
     28 
     29 #include "util/u_math.h"
     30 #include "util/register_allocate.h"
     31 #include "util/ralloc.h"
     32 #include "util/bitset.h"
     33 
     34 #include "freedreno_util.h"
     35 
     36 #include "ir3.h"
     37 #include "ir3_compiler.h"
     38 
     39 /*
     40  * Register Assignment:
     41  *
     42  * Uses the register_allocate util, which implements graph coloring
     43  * algo with interference classes.  To handle the cases where we need
     44  * consecutive registers (for example, texture sample instructions),
     45  * we model these as larger (double/quad/etc) registers which conflict
     46  * with the corresponding registers in other classes.
     47  *
     48  * Additionally we create classes for half-regs, which do not
     49  * conflict with the full-reg classes.  We do need at least sizes
     50  * 1-4 (to deal w/ texture sample instructions that output to half-
     51  * reg).  At the moment we don't create the higher order half-reg
     52  * classes, as half-reg frequently does not have enough precision
     53  * for texture coords at higher resolutions.
     54  *
     55  * There are some additional cases that we need to handle specially,
     56  * as the graph coloring algo doesn't understand "partial writes".
     57  * For example, a sequence like:
     58  *
     59  *   add r0.z, ...
     60  *   sam (f32)(xy)r0.x, ...
     61  *   ...
     62  *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
     63  *
     64  * In this scenario, we treat r0.xyz as class size 3, which is written
     65  * (from a use/def perspective) at the 'add' instruction, and we ignore the
     66  * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
     67  * defining instruction, as it is the first to partially write r0.xyz.
     68  *
     69  * Note i965 has a similar scenario, which they solve with a virtual
     70  * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
     71  * register assignment.  But for us that is horrible from a scheduling
     72  * standpoint.  Instead what we do is use the idea of a 'definer' instruction.
     73  * Ie. the first instruction (lowest ip) to write to the variable is the
     74  * one we consider from a use/def perspective when building the interference
     75  * graph.  (Other instructions which write other variable components
     76  * just define the variable some more.)
     77  *
     78  * Arrays of arbitrary size are handled via pre-coloring a consecutive
     79  * sequence of registers.  Additional scalar (single component) reg
     80  * names are allocated starting at ctx->class_base[total_class_count]
     81  * (see arr->base), which are pre-colored.  In the use/def graph direct
     82  * access is treated as a single element use/def, and indirect access
     83  * is treated as use or def of all array elements.  (Only the first
     84  * def is tracked, in case of multiple indirect writes, etc.)
     85  */
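        /* A rough illustration of the 'definer' bookkeeping for the example
         * above (nothing new, just restating it concretely): the whole r0.xyz
         * group becomes a single size-3 node in the interference graph, so
         *
         *    def(r0.xyz) = ip of 'add r0.z, ...'   (lowest ip, the definer)
         *    use(r0.xyz) = ip of the last sam reading r0.xyz as its coord
         *
         * and the later partial writes to r0.xy do not start a new range.
         */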
     86 
     87 static const unsigned class_sizes[] = {
     88 	1, 2, 3, 4,
     89 	4 + 4, /* txd + 1d/2d */
     90 	4 + 6, /* txd + 3d */
     91 };
     92 #define class_count ARRAY_SIZE(class_sizes)
     93 
     94 static const unsigned half_class_sizes[] = {
     95 	1, 2, 3, 4,
     96 };
     97 #define half_class_count  ARRAY_SIZE(half_class_sizes)
     98 
     99 /* seems to just be used for compute shaders?  Seems like vec1 and vec3
    100  * are sufficient (for now?)
    101  */
    102 static const unsigned high_class_sizes[] = {
    103 	1, 3,
    104 };
    105 #define high_class_count ARRAY_SIZE(high_class_sizes)
    106 
    107 #define total_class_count (class_count + half_class_count + high_class_count)
    108 
    109 /* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
    110 #define NUM_REGS             (4 * 48)  /* r0 to r47 */
    111 #define NUM_HIGH_REGS        (4 * 8)   /* r48 to r55 */
    112 #define FIRST_HIGH_REG       (4 * 48)
    113 /* Number of virtual regs in a given class: */
    114 #define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
    115 #define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
    116 #define HIGH_CLASS_REGS(i)   (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
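        /* For example, just spelling out the macros above: with NUM_REGS = 192
         * scalar gprs, the size-4 full class has CLASS_REGS = 192 - (4 - 1) = 189
         * virtual registers, one for each scalar position a vec4 value could
         * start at (0 .. 188, ie. r0.x .. r47.x).
         */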
    117 
    118 #define HALF_OFFSET          (class_count)
    119 #define HIGH_OFFSET          (class_count + half_class_count)
    120 
    121 /* register-set, created one time, used for all shaders: */
    122 struct ir3_ra_reg_set {
    123 	struct ra_regs *regs;
    124 	unsigned int classes[class_count];
    125 	unsigned int half_classes[half_class_count];
    126 	unsigned int high_classes[high_class_count];
    127 	/* maps flat virtual register space to base gpr: */
    128 	uint16_t *ra_reg_to_gpr;
    129 	/* maps cls,gpr to flat virtual register space: */
    130 	uint16_t **gpr_to_ra_reg;
    131 };
    132 
    133 static void
    134 build_q_values(unsigned int **q_values, unsigned off,
    135 		const unsigned *sizes, unsigned count)
    136 {
    137 	for (unsigned i = 0; i < count; i++) {
    138 		q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);
    139 
    140 		/* From register_allocate.c:
    141 		 *
    142 		 * q(B,C) (indexed by C, B is this register class) in
    143 		 * Runeson/Nyström paper.  This is "how many registers of B could
    144 		 * the worst choice register from C conflict with".
    145 		 *
    146 		 * If we just let the register allocation algorithm compute these
    147 		 * values, it is extremely expensive.  However, since all of our
    148 		 * registers are laid out, we can very easily compute them
    149 		 * ourselves.  View the register from C as fixed starting at GRF n
    150 		 * somewhere in the middle, and the register from B as sliding back
    151 		 * and forth.  Then the first register to conflict from B is the
    152 		 * one starting at n - class_size[B] + 1 and the last register to
    153 		 * conflict will start at n + class_size[B] - 1.  Therefore, the
    154 		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
    155 		 *
    156 		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
    157 		 * B | | | | | |n| --> | | | | | | |
    158 		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
    159 		 *             +-+-+-+-+-+
    160 		 * C           |n| | | | |
    161 		 *             +-+-+-+-+-+
    162 		 *
    163 		 * (Idea copied from brw_fs_reg_allocate.cpp)
    164 		 */
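        		/* For instance, with the full-precision sizes above, q between the
        		 * size-2 and size-4 classes is 2 + 4 - 1 = 5: sliding a vec2 back
        		 * and forth across a fixed vec4 gives five conflicting positions.
        		 */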
    165 		for (unsigned j = 0; j < count; j++)
    166 			q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
    167 	}
    168 }
    169 
    170 /* One-time setup of RA register-set, which describes all the possible
    171  * "virtual" registers and their interferences.  Ie. a double register
    172  * occupies (and conflicts with) two single registers, and so forth.
    173  * Since registers do not need to be aligned to their class size, they
    174  * can conflict with other registers in the same class too.  Ie:
    175  *
    176  *    Single (base) |  Double
    177  *    --------------+---------------
    178  *       R0         |  D0
    179  *       R1         |  D0 D1
    180  *       R2         |     D1 D2
    181  *       R3         |        D2
    182  *           .. and so on..
    183  *
    184  * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
    185  * really just four scalar registers.  Don't let that confuse you.)
    186  */
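        /* Concretely (assuming the loop below, where the scalar class is added
         * first so its ra_reg index equals the base gpr number): when the size-2
         * register D1 is added, ra_add_transitive_reg_conflict() is called for
         * scalar regs 1 and 2, which is exactly where D1 appears in the table
         * above.
         */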
    187 struct ir3_ra_reg_set *
    188 ir3_ra_alloc_reg_set(void *memctx)
    189 {
    190 	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
    191 	unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
    192 	unsigned int **q_values;
    193 
    194 	/* calculate # of regs across all classes: */
    195 	ra_reg_count = 0;
    196 	for (unsigned i = 0; i < class_count; i++)
    197 		ra_reg_count += CLASS_REGS(i);
    198 	for (unsigned i = 0; i < half_class_count; i++)
    199 		ra_reg_count += HALF_CLASS_REGS(i);
    200 	for (unsigned i = 0; i < high_class_count; i++)
    201 		ra_reg_count += HIGH_CLASS_REGS(i);
    202 
    203 	/* allocate and populate q_values: */
    204 	q_values = ralloc_array(set, unsigned *, total_class_count);
    205 
    206 	build_q_values(q_values, 0, class_sizes, class_count);
    207 	build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
    208 	build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);
    209 
    210 	/* allocate the reg-set.. */
    211 	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
    212 	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
    213 	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
    214 
    215 	/* .. and classes */
    216 	reg = 0;
    217 	for (unsigned i = 0; i < class_count; i++) {
    218 		set->classes[i] = ra_alloc_reg_class(set->regs);
    219 
    220 		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
    221 
    222 		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
    223 			ra_class_add_reg(set->regs, set->classes[i], reg);
    224 
    225 			set->ra_reg_to_gpr[reg] = j;
    226 			set->gpr_to_ra_reg[i][j] = reg;
    227 
    228 			for (unsigned br = j; br < j + class_sizes[i]; br++)
    229 				ra_add_transitive_reg_conflict(set->regs, br, reg);
    230 
    231 			reg++;
    232 		}
    233 	}
    234 
    235 	first_half_reg = reg;
    236 	base = HALF_OFFSET;
    237 
    238 	for (unsigned i = 0; i < half_class_count; i++) {
    239 		set->half_classes[i] = ra_alloc_reg_class(set->regs);
    240 
    241 		set->gpr_to_ra_reg[base + i] =
    242 				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
    243 
    244 		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
    245 			ra_class_add_reg(set->regs, set->half_classes[i], reg);
    246 
    247 			set->ra_reg_to_gpr[reg] = j;
    248 			set->gpr_to_ra_reg[base + i][j] = reg;
    249 
    250 			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
    251 				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
    252 
    253 			reg++;
    254 		}
    255 	}
    256 
    257 	first_high_reg = reg;
    258 	base = HIGH_OFFSET;
    259 
    260 	for (unsigned i = 0; i < high_class_count; i++) {
    261 		set->high_classes[i] = ra_alloc_reg_class(set->regs);
    262 
    263 		set->gpr_to_ra_reg[base + i] =
    264 				ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));
    265 
    266 		for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
    267 			ra_class_add_reg(set->regs, set->high_classes[i], reg);
    268 
    269 			set->ra_reg_to_gpr[reg] = j;
    270 			set->gpr_to_ra_reg[base + i][j] = reg;
    271 
    272 			for (unsigned br = j; br < j + high_class_sizes[i]; br++)
    273 				ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);
    274 
    275 			reg++;
    276 		}
    277 	}
    278 
    279 
    280 	ra_set_finalize(set->regs, q_values);
    281 
    282 	ralloc_free(q_values);
    283 
    284 	return set;
    285 }
    286 
    287 /* additional block-data (per-block) */
    288 struct ir3_ra_block_data {
    289 	BITSET_WORD *def;        /* variables defined before used in block */
    290 	BITSET_WORD *use;        /* variables used before defined in block */
    291 	BITSET_WORD *livein;     /* which defs reach entry point of block */
    292 	BITSET_WORD *liveout;    /* which defs reach exit point of block */
    293 };
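        /* livein/liveout are computed from def/use with the usual backwards
         * dataflow equations, iterated to a fixed point in
         * ra_compute_livein_liveout():
         *
         *    livein(b)  = use(b) | (liveout(b) & ~def(b))
         *    liveout(b) = OR of livein(s) over each successor s of b
         */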
    294 
    295 /* additional instruction-data (per-instruction) */
    296 struct ir3_ra_instr_data {
    297 	/* cached instruction 'definer' info: */
    298 	struct ir3_instruction *defn;
    299 	int off, sz, cls;
    300 };
    301 
    302 /* register-assign context, per-shader */
    303 struct ir3_ra_ctx {
    304 	struct ir3 *ir;
    305 	enum shader_t type;
    306 	bool frag_face;
    307 
    308 	struct ir3_ra_reg_set *set;
    309 	struct ra_graph *g;
    310 	unsigned alloc_count;
    311 	/* one per class, plus one slot for arrays: */
    312 	unsigned class_alloc_count[total_class_count + 1];
    313 	unsigned class_base[total_class_count + 1];
    314 	unsigned instr_cnt;
    315 	unsigned *def, *use;     /* def/use table */
    316 	struct ir3_ra_instr_data *instrd;
    317 };
    318 
    319 /* does it conflict? */
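        /* (note the comparison is strict at the endpoints, so eg.
         * intersects(2, 5, 5, 9) is false: a range ending exactly where another
         * starts is not considered a conflict)
         */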
    320 static inline bool
    321 intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
    322 {
    323 	return !((a_start >= b_end) || (b_start >= a_end));
    324 }
    325 
    326 static bool
    327 is_half(struct ir3_instruction *instr)
    328 {
    329 	return !!(instr->regs[0]->flags & IR3_REG_HALF);
    330 }
    331 
    332 static bool
    333 is_high(struct ir3_instruction *instr)
    334 {
    335 	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
    336 }
    337 
    338 static int
    339 size_to_class(unsigned sz, bool half, bool high)
    340 {
    341 	if (high) {
    342 		for (unsigned i = 0; i < high_class_count; i++)
    343 			if (high_class_sizes[i] >= sz)
    344 				return i + HIGH_OFFSET;
    345 	} else if (half) {
    346 		for (unsigned i = 0; i < half_class_count; i++)
    347 			if (half_class_sizes[i] >= sz)
    348 				return i + HALF_OFFSET;
    349 	} else {
    350 		for (unsigned i = 0; i < class_count; i++)
    351 			if (class_sizes[i] >= sz)
    352 				return i;
    353 	}
    354 	debug_assert(0);
    355 	return -1;
    356 }
    357 
    358 static bool
    359 is_temp(struct ir3_register *reg)
    360 {
    361 	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
    362 		return false;
    363 	if ((reg->num == regid(REG_A0, 0)) ||
    364 			(reg->num == regid(REG_P0, 0)))
    365 		return false;
    366 	return true;
    367 }
    368 
    369 static bool
    370 writes_gpr(struct ir3_instruction *instr)
    371 {
    372 	if (is_store(instr))
    373 		return false;
    374 	/* is dest a normal temp register: */
    375 	return is_temp(instr->regs[0]);
    376 }
    377 
    378 static bool
    379 instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
    380 {
    381 	if (a->flags & IR3_INSTR_UNUSED)
    382 		return false;
    383 	return (a->ip < b->ip);
    384 }
    385 
    386 static struct ir3_instruction *
    387 get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
    388 		int *sz, int *off)
    389 {
    390 	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    391 	struct ir3_instruction *d = NULL;
    392 
    393 	if (id->defn) {
    394 		*sz = id->sz;
    395 		*off = id->off;
    396 		return id->defn;
    397 	}
    398 
    399 	if (instr->opc == OPC_META_FI) {
    400 		/* What about the case where the collect is a subset of an array?
    401 		 * We'd need to find the distance between where the actual array
    402 		 * starts and the fanin..  that probably doesn't happen currently.
    403 		 */
    404 		struct ir3_register *src;
    405 		int dsz, doff;
    406 
    407 		/* note: don't use foreach_ssa_src as this gets called once
    408 		 * while assigning regs (which clears SSA flag)
    409 		 */
    410 		foreach_src_n(src, n, instr) {
    411 			struct ir3_instruction *dd;
    412 			if (!src->instr)
    413 				continue;
    414 
    415 			dd = get_definer(ctx, src->instr, &dsz, &doff);
    416 
    417 			if ((!d) || instr_before(dd, d)) {
    418 				d = dd;
    419 				*sz = dsz;
    420 				*off = doff - n;
    421 			}
    422 		}
    423 
    424 	} else if (instr->cp.right || instr->cp.left) {
    425 		/* covers also the meta:fo case, which ends up w/ single
    426 		 * scalar instructions for each component:
    427 		 */
    428 		struct ir3_instruction *f = ir3_neighbor_first(instr);
    429 
    430 		/* by definition, the entire sequence forms one linked list
    431 		 * of single scalar register nodes (even if some of them may
    432 		 * be fanouts from, for example, a texture sample instr).  We
    433 		 * just need to walk the list finding the first element of
    434 		 * the group defined (lowest ip).
    435 		 */
    436 		int cnt = 0;
    437 
    438 		/* need to skip over unused in the group: */
    439 		while (f && (f->flags & IR3_INSTR_UNUSED)) {
    440 			f = f->cp.right;
    441 			cnt++;
    442 		}
    443 
    444 		while (f) {
    445 			if ((!d) || instr_before(f, d))
    446 				d = f;
    447 			if (f == instr)
    448 				*off = cnt;
    449 			f = f->cp.right;
    450 			cnt++;
    451 		}
    452 
    453 		*sz = cnt;
    454 
    455 	} else {
    456 		/* second case is looking directly at the instruction which
    457 		 * produces multiple values (eg, texture sample), rather
    458 		 * than the fanout nodes that point back to that instruction.
    459 		 * This isn't quite right, because it may be part of a larger
    460 		 * group, such as:
    461 		 *
    462 		 *     sam (f32)(xyzw)r0.x, ...
    463 		 *     add r1.x, ...
    464 		 *     add r1.y, ...
    465 		 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
    466 		 *
    467 		 * need to come up with a better way to handle that case.
    468 		 */
    469 		if (instr->address) {
    470 			*sz = instr->regs[0]->size;
    471 		} else {
    472 			*sz = util_last_bit(instr->regs[0]->wrmask);
    473 		}
    474 		*off = 0;
    475 		d = instr;
    476 	}
    477 
    478 	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
    479 		struct ir3_instruction *phi = d->regs[0]->instr;
    480 		struct ir3_instruction *dd;
    481 		int dsz, doff;
    482 
    483 		dd = get_definer(ctx, phi, &dsz, &doff);
    484 
    485 		*sz = MAX2(*sz, dsz);
    486 		*off = doff;
    487 
    488 		if (instr_before(dd, d)) {
    489 			d = dd;
    490 		}
    491 	}
    492 
    493 	if (d->opc == OPC_META_PHI) {
    494 		/* we have already inserted parallel-copies into
    495 		 * the phi, so we don't need to chase definers
    496 		 */
    497 		struct ir3_register *src;
    498 		struct ir3_instruction *dd = d;
    499 
    500 		/* note: don't use foreach_ssa_src as this gets called once
    501 		 * while assigning regs (which clears SSA flag)
    502 		 */
    503 		foreach_src(src, d) {
    504 			if (!src->instr)
    505 				continue;
    506 			if (instr_before(src->instr, dd))
    507 				dd = src->instr;
    508 		}
    509 
    510 		d = dd;
    511 	}
    512 
    513 	if (d->opc == OPC_META_FO) {
    514 		struct ir3_instruction *dd;
    515 		int dsz, doff;
    516 
    517 		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);
    518 
    519 		/* by definition, should come before: */
    520 		debug_assert(instr_before(dd, d));
    521 
    522 		*sz = MAX2(*sz, dsz);
    523 
    524 		debug_assert(instr->opc == OPC_META_FO);
    525 		*off = MAX2(*off, instr->fo.off);
    526 
    527 		d = dd;
    528 	}
    529 
    530 	id->defn = d;
    531 	id->sz = *sz;
    532 	id->off = *off;
    533 
    534 	return d;
    535 }
    536 
    537 static void
    538 ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
    539 {
    540 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
    541 		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    542 		if (instr->regs_count == 0)
    543 			continue;
    544 		/* couple special cases: */
    545 		if (writes_addr(instr) || writes_pred(instr)) {
    546 			id->cls = -1;
    547 		} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
    548 			id->cls = total_class_count;
    549 			id->defn = instr;
    550 		} else {
    551 			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
    552 			id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
    553 		}
    554 	}
    555 }
    556 
    557 /* give each instruction a name (and ip), and count up the # of names
    558  * of each class
    559  */
    560 static void
    561 ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
    562 {
    563 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
    564 		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    565 
    566 #ifdef DEBUG
    567 		instr->name = ~0;
    568 #endif
    569 
    570 		ctx->instr_cnt++;
    571 
    572 		if (instr->regs_count == 0)
    573 			continue;
    574 
    575 		if (!writes_gpr(instr))
    576 			continue;
    577 
    578 		if (id->defn != instr)
    579 			continue;
    580 
    581 		/* arrays which don't fit in one of the pre-defined class
    582 		 * sizes are pre-colored:
    583 		 */
    584 		if (id->cls >= 0) {
    585 			instr->name = ctx->class_alloc_count[id->cls]++;
    586 			ctx->alloc_count++;
    587 		}
    588 	}
    589 }
    590 
    591 static void
    592 ra_init(struct ir3_ra_ctx *ctx)
    593 {
    594 	unsigned n, base;
    595 
    596 	ir3_clear_mark(ctx->ir);
    597 	n = ir3_count_instructions(ctx->ir);
    598 
    599 	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);
    600 
    601 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
    602 		ra_block_find_definers(ctx, block);
    603 	}
    604 
    605 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
    606 		ra_block_name_instructions(ctx, block);
    607 	}
    608 
    609 	/* figure out the base register name for each class.  The
    610 	 * actual ra name is class_base[cls] + instr->name;
    611 	 */
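        	/* eg. if class 0 ended up with 10 names and class 1 with 3, then
        	 * class_base[1] = 10, class_base[2] = 13, and a class-1 instr with
        	 * instr->name == 2 gets ra name 10 + 2 = 12.
        	 */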
    612 	ctx->class_base[0] = 0;
    613 	for (unsigned i = 1; i <= total_class_count; i++) {
    614 		ctx->class_base[i] = ctx->class_base[i-1] +
    615 				ctx->class_alloc_count[i-1];
    616 	}
    617 
    618 	/* and vreg names for array elements: */
    619 	base = ctx->class_base[total_class_count];
    620 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
    621 		arr->base = base;
    622 		ctx->class_alloc_count[total_class_count] += arr->length;
    623 		base += arr->length;
    624 	}
    625 	ctx->alloc_count += ctx->class_alloc_count[total_class_count];
    626 
    627 	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
    628 	ralloc_steal(ctx->g, ctx->instrd);
    629 	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
    630 	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
    631 }
    632 
    633 static unsigned
    634 __ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
    635 {
    636 	unsigned name;
    637 	debug_assert(cls >= 0);
    638 	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
    639 	name = ctx->class_base[cls] + defn->name;
    640 	debug_assert(name < ctx->alloc_count);
    641 	return name;
    642 }
    643 
    644 static int
    645 ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
    646 {
    647 	/* TODO handle name mapping for arrays */
    648 	return __ra_name(ctx, id->cls, id->defn);
    649 }
    650 
    651 static void
    652 ra_destroy(struct ir3_ra_ctx *ctx)
    653 {
    654 	ralloc_free(ctx->g);
    655 }
    656 
    657 static void
    658 ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
    659 {
    660 	struct ir3_ra_block_data *bd;
    661 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
    662 
    663 #define def(name, instr) \
    664 		do { \
    665 			/* defined on first write: */ \
    666 			if (!ctx->def[name]) \
    667 				ctx->def[name] = instr->ip; \
    668 			ctx->use[name] = instr->ip; \
    669 			BITSET_SET(bd->def, name); \
    670 		} while(0)
    671 
    672 #define use(name, instr) \
    673 		do { \
    674 			ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
    675 			if (!BITSET_TEST(bd->def, name)) \
    676 				BITSET_SET(bd->use, name); \
    677 		} while(0)
    678 
    679 	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
    680 
    681 	bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
    682 	bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
    683 	bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
    684 	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
    685 
    686 	block->data = bd;
    687 
    688 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
    689 		struct ir3_instruction *src;
    690 		struct ir3_register *reg;
    691 
    692 		if (instr->regs_count == 0)
    693 			continue;
    694 
    695 		/* There are a couple special cases to deal with here:
    696 		 *
    697 		 * fanout: used to split values from a higher class to a lower
    698 		 *     class, for example split the results of a texture fetch
    699 		 *     into individual scalar values;  We skip over these from
    700 		 *     a 'def' perspective, and for a 'use' we walk the chain
    701 		 *     up to the defining instruction.
    702 		 *
    703 		 * fanin: used to collect values from lower class and assemble
    704 		 *     them together into a higher class, for example arguments
    705 		 *     to texture sample instructions;  We consider these to be
    706 		 *     defined at the earliest fanin source.
    707 		 *
    708 		 * phi: used to merge values from different flow control paths
    709 		 *     to the same reg.  Consider defined at earliest phi src,
    710 		 *     and update all the other phi src's (which may come later
    711 		 *     in the program) as users to extend the var's live range.
    712 		 *
    713 		 * Most of this, other than phi, is completely handled in the
    714 		 * get_definer() helper.
    715 		 *
    716 		 * In each case, we trace the instruction back to the original
    717 		 * definer and consider that as the def/use ip.
    718 		 */
    719 
    720 		if (writes_gpr(instr)) {
    721 			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
    722 			struct ir3_register *dst = instr->regs[0];
    723 
    724 			if (dst->flags & IR3_REG_ARRAY) {
    725 				struct ir3_array *arr =
    726 					ir3_lookup_array(ctx->ir, dst->array.id);
    727 				unsigned i;
    728 
    729 				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));
    730 
    731 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
    732 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
    733 
    734 				/* set the node class now.. in case we don't encounter
    735 				 * this array dst again.  From register_alloc algo's
    736 				 * perspective, these are all single/scalar regs:
    737 				 */
    738 				for (i = 0; i < arr->length; i++) {
    739 					unsigned name = arr->base + i;
    740 					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
    741 				}
    742 
    743 				/* indirect write is treated like a write to all array
    744 				 * elements, since we don't know which one is actually
    745 				 * written:
    746 				 */
    747 				if (dst->flags & IR3_REG_RELATIV) {
    748 					for (i = 0; i < arr->length; i++) {
    749 						unsigned name = arr->base + i;
    750 						def(name, instr);
    751 					}
    752 				} else {
    753 					unsigned name = arr->base + dst->array.offset;
    754 					def(name, instr);
    755 				}
    756 
    757 			} else if (id->defn == instr) {
    758 				unsigned name = ra_name(ctx, id);
    759 
    760 				/* since we are in SSA at this point: */
    761 				debug_assert(!BITSET_TEST(bd->use, name));
    762 
    763 				def(name, id->defn);
    764 
    765 				if (is_high(id->defn)) {
    766 					ra_set_node_class(ctx->g, name,
    767 							ctx->set->high_classes[id->cls - HIGH_OFFSET]);
    768 				} else if (is_half(id->defn)) {
    769 					ra_set_node_class(ctx->g, name,
    770 							ctx->set->half_classes[id->cls - HALF_OFFSET]);
    771 				} else {
    772 					ra_set_node_class(ctx->g, name,
    773 							ctx->set->classes[id->cls]);
    774 				}
    775 
    776 				/* extend the live range for phi srcs, which may come
    777 				 * from the bottom of the loop
    778 				 */
    779 				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
    780 					struct ir3_instruction *phi = id->defn->regs[0]->instr;
    781 					foreach_ssa_src(src, phi) {
    782 						/* if src is after phi, then we need to extend
    783 						 * the liverange to the end of src's block:
    784 						 */
    785 						if (src->ip > phi->ip) {
    786 							struct ir3_instruction *last =
    787 									list_last_entry(&src->block->instr_list,
    788 											struct ir3_instruction, node);
    789 							ctx->use[name] = MAX2(ctx->use[name], last->ip);
    790 						}
    791 					}
    792 				}
    793 			}
    794 		}
    795 
    796 		foreach_src(reg, instr) {
    797 			if (reg->flags & IR3_REG_ARRAY) {
    798 				struct ir3_array *arr =
    799 					ir3_lookup_array(ctx->ir, reg->array.id);
    800 				arr->start_ip = MIN2(arr->start_ip, instr->ip);
    801 				arr->end_ip = MAX2(arr->end_ip, instr->ip);
    802 				/* indirect read is treated like a read from all array
    803 				 * elements, since we don't know which one is actually
    804 				 * read:
    805 				 */
    806 				if (reg->flags & IR3_REG_RELATIV) {
    807 					unsigned i;
    808 					for (i = 0; i < arr->length; i++) {
    809 						unsigned name = arr->base + i;
    810 						use(name, instr);
    811 					}
    812 				} else {
    813 					unsigned name = arr->base + reg->array.offset;
    814 					use(name, instr);
    815 					debug_assert(reg->array.offset < arr->length);
    816 				}
    817 			} else if ((src = ssa(reg)) && writes_gpr(src)) {
    818 				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
    819 				use(name, instr);
    820 			}
    821 		}
    822 	}
    823 }
    824 
    825 static bool
    826 ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
    827 {
    828 	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
    829 	bool progress = false;
    830 
    831 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
    832 		struct ir3_ra_block_data *bd = block->data;
    833 
    834 		/* update livein: */
    835 		for (unsigned i = 0; i < bitset_words; i++) {
    836 			BITSET_WORD new_livein =
    837 				(bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
    838 
    839 			if (new_livein & ~bd->livein[i]) {
    840 				bd->livein[i] |= new_livein;
    841 				progress = true;
    842 			}
    843 		}
    844 
    845 		/* update liveout: */
    846 		for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
    847 			struct ir3_block *succ = block->successors[j];
    848 			struct ir3_ra_block_data *succ_bd;
    849 
    850 			if (!succ)
    851 				continue;
    852 
    853 			succ_bd = succ->data;
    854 
    855 			for (unsigned i = 0; i < bitset_words; i++) {
    856 				BITSET_WORD new_liveout =
    857 					(succ_bd->livein[i] & ~bd->liveout[i]);
    858 
    859 				if (new_liveout) {
    860 					bd->liveout[i] |= new_liveout;
    861 					progress = true;
    862 				}
    863 			}
    864 		}
    865 	}
    866 
    867 	return progress;
    868 }
    869 
    870 static void
    871 print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
    872 {
    873 	bool first = true;
    874 	debug_printf("  %s:", name);
    875 	for (unsigned i = 0; i < cnt; i++) {
    876 		if (BITSET_TEST(bs, i)) {
    877 			if (!first)
    878 				debug_printf(",");
    879 			debug_printf(" %04u", i);
    880 			first = false;
    881 		}
    882 	}
    883 	debug_printf("\n");
    884 }
    885 
    886 static void
    887 ra_add_interference(struct ir3_ra_ctx *ctx)
    888 {
    889 	struct ir3 *ir = ctx->ir;
    890 
    891 	/* initialize array live ranges: */
    892 	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
    893 		arr->start_ip = ~0;
    894 		arr->end_ip = 0;
    895 	}
    896 
    897 	/* compute live ranges (use/def) on a block level, also updating
    898 	 * block's def/use bitmasks (used below to calculate per-block
    899 	 * livein/liveout):
    900 	 */
    901 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
    902 		ra_block_compute_live_ranges(ctx, block);
    903 	}
    904 
    905 	/* update per-block livein/liveout: */
    906 	while (ra_compute_livein_liveout(ctx)) {}
    907 
    908 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
    909 		debug_printf("AFTER LIVEIN/OUT:\n");
    910 		ir3_print(ir);
    911 		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
    912 			struct ir3_ra_block_data *bd = block->data;
    913 			debug_printf("block%u:\n", block_id(block));
    914 			print_bitset("def", bd->def, ctx->alloc_count);
    915 			print_bitset("use", bd->use, ctx->alloc_count);
    916 			print_bitset("l/i", bd->livein, ctx->alloc_count);
    917 			print_bitset("l/o", bd->liveout, ctx->alloc_count);
    918 		}
    919 	}
    920 
    921 	/* extend start/end ranges based on livein/liveout info from cfg: */
    922 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
    923 		struct ir3_ra_block_data *bd = block->data;
    924 
    925 		for (unsigned i = 0; i < ctx->alloc_count; i++) {
    926 			if (BITSET_TEST(bd->livein, i)) {
    927 				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
    928 				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
    929 			}
    930 
    931 			if (BITSET_TEST(bd->liveout, i)) {
    932 				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
    933 				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
    934 			}
    935 		}
    936 	}
    937 
    938 	/* need to fix things up to keep outputs live: */
    939 	for (unsigned i = 0; i < ir->noutputs; i++) {
    940 		struct ir3_instruction *instr = ir->outputs[i];
    941 		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
    942 		ctx->use[name] = ctx->instr_cnt;
    943 	}
    944 
    945 	for (unsigned i = 0; i < ctx->alloc_count; i++) {
    946 		for (unsigned j = 0; j < ctx->alloc_count; j++) {
    947 			if (intersects(ctx->def[i], ctx->use[i],
    948 					ctx->def[j], ctx->use[j])) {
    949 				ra_add_node_interference(ctx->g, i, j);
    950 			}
    951 		}
    952 	}
    953 }
    954 
    955 /* some instructions need fix-up if dst register is half precision: */
    956 static void fixup_half_instr_dst(struct ir3_instruction *instr)
    957 {
    958 	switch (opc_cat(instr->opc)) {
    959 	case 1: /* move instructions */
    960 		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
    961 		break;
    962 	case 3:
    963 		switch (instr->opc) {
    964 		case OPC_MAD_F32:
    965 			instr->opc = OPC_MAD_F16;
    966 			break;
    967 		case OPC_SEL_B32:
    968 			instr->opc = OPC_SEL_B16;
    969 			break;
    970 		case OPC_SEL_S32:
    971 			instr->opc = OPC_SEL_S16;
    972 			break;
    973 		case OPC_SEL_F32:
    974 			instr->opc = OPC_SEL_F16;
    975 			break;
    976 		case OPC_SAD_S32:
    977 			instr->opc = OPC_SAD_S16;
    978 			break;
    979 		/* instructions may already be fixed up: */
    980 		case OPC_MAD_F16:
    981 		case OPC_SEL_B16:
    982 		case OPC_SEL_S16:
    983 		case OPC_SEL_F16:
    984 		case OPC_SAD_S16:
    985 			break;
    986 		default:
    987 			assert(0);
    988 			break;
    989 		}
    990 		break;
    991 	case 5:
    992 		instr->cat5.type = half_type(instr->cat5.type);
    993 		break;
    994 	}
    995 }
    996 /* some instructions need fix-up if src register is half precision: */
    997 static void fixup_half_instr_src(struct ir3_instruction *instr)
    998 {
    999 	switch (instr->opc) {
   1000 	case OPC_MOV:
   1001 		instr->cat1.src_type = half_type(instr->cat1.src_type);
   1002 		break;
   1003 	default:
   1004 		break;
   1005 	}
   1006 }
   1007 
   1008 /* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first
   1009  * array access(es) which do not have any previous access to depend
   1010  * on from a scheduling point of view.
   1011  */
   1012 static void
   1013 reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
   1014 		struct ir3_instruction *instr)
   1015 {
   1016 	struct ir3_ra_instr_data *id;
   1017 
   1018 	if (reg->flags & IR3_REG_ARRAY) {
   1019 		struct ir3_array *arr =
   1020 			ir3_lookup_array(ctx->ir, reg->array.id);
   1021 		unsigned name = arr->base + reg->array.offset;
   1022 		unsigned r = ra_get_node_reg(ctx->g, name);
   1023 		unsigned num = ctx->set->ra_reg_to_gpr[r];
   1024 
   1025 		if (reg->flags & IR3_REG_RELATIV) {
   1026 			reg->array.offset = num;
   1027 		} else {
   1028 			reg->num = num;
   1029 		}
   1030 
   1031 		reg->flags &= ~IR3_REG_ARRAY;
   1032 	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
   1033 		unsigned name = ra_name(ctx, id);
   1034 		unsigned r = ra_get_node_reg(ctx->g, name);
   1035 		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;
   1036 
   1037 		debug_assert(!(reg->flags & IR3_REG_RELATIV));
   1038 
   1039 		if (is_high(id->defn))
   1040 			num += FIRST_HIGH_REG;
   1041 
   1042 		reg->num = num;
   1043 		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
   1044 
   1045 		if (is_half(id->defn))
   1046 			reg->flags |= IR3_REG_HALF;
   1047 	}
   1048 }
   1049 
   1050 static void
   1051 ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
   1052 {
   1053 	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
   1054 		struct ir3_register *reg;
   1055 
   1056 		if (instr->regs_count == 0)
   1057 			continue;
   1058 
   1059 		if (writes_gpr(instr)) {
   1060 			reg_assign(ctx, instr->regs[0], instr);
   1061 			if (instr->regs[0]->flags & IR3_REG_HALF)
   1062 				fixup_half_instr_dst(instr);
   1063 		}
   1064 
   1065 		foreach_src_n(reg, n, instr) {
   1066 			struct ir3_instruction *src = reg->instr;
   1067 			/* Note: reg->instr could be null for IR3_REG_ARRAY */
   1068 			if (!(src || (reg->flags & IR3_REG_ARRAY)))
   1069 				continue;
   1070 			reg_assign(ctx, instr->regs[n+1], src);
   1071 			if (instr->regs[n+1]->flags & IR3_REG_HALF)
   1072 				fixup_half_instr_src(instr);
   1073 		}
   1074 	}
   1075 }
   1076 
   1077 static int
   1078 ra_alloc(struct ir3_ra_ctx *ctx)
   1079 {
   1080 	unsigned n = 0;
   1081 
   1082 	/* frag shader inputs get pre-assigned, since we have some
   1083 	 * constraints/unknowns about setup for some of these regs:
   1084 	 */
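        	/* (below, remaining inputs are just packed consecutively from gpr 0;
        	 * frag_face, when present, is pinned to the first half-reg, ie. hr0.x)
        	 */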
   1085 	if (ctx->type == SHADER_FRAGMENT) {
   1086 		struct ir3 *ir = ctx->ir;
   1087 		unsigned i = 0, j;
   1088 		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
   1089 			struct ir3_instruction *instr = ir->inputs[i];
   1090 			int cls = size_to_class(1, true, false);
   1091 			unsigned name = __ra_name(ctx, cls, instr);
   1092 			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
   1093 
   1094 			/* if we have frag_face, it gets hr0.x */
   1095 			ra_set_node_reg(ctx->g, name, reg);
   1096 			i += 4;
   1097 		}
   1098 
   1099 		j = 0;
   1100 		for (; i < ir->ninputs; i++) {
   1101 			struct ir3_instruction *instr = ir->inputs[i];
   1102 			if (instr) {
   1103 				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
   1104 
   1105 				if (id->defn == instr) {
   1106 					unsigned name, reg;
   1107 
   1108 					name = ra_name(ctx, id);
   1109 					reg = ctx->set->gpr_to_ra_reg[id->cls][j];
   1110 
   1111 					ra_set_node_reg(ctx->g, name, reg);
   1112 					j += id->sz;
   1113 				}
   1114 			}
   1115 		}
   1116 		n = j;
   1117 	}
   1118 
   1119 	/* pre-assign array elements:
   1120 	 */
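        	/* (each array gets a run of consecutive scalar regs starting after any
        	 * pre-assigned inputs; the 'retry' below bumps the base past earlier
        	 * arrays that overlap both in live range and in register range)
        	 */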
   1121 	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
   1122 		unsigned base = n;
   1123 
   1124 		if (arr->end_ip == 0)
   1125 			continue;
   1126 
   1127 		/* figure out what else we conflict with that has already
   1128 		 * been assigned:
   1129 		 */
   1130 retry:
   1131 		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
   1132 			if (arr2 == arr)
   1133 				break;
   1134 			if (arr2->end_ip == 0)
   1135 				continue;
   1136 			/* if it intersects with liverange AND register range.. */
   1137 			if (intersects(arr->start_ip, arr->end_ip,
   1138 					arr2->start_ip, arr2->end_ip) &&
   1139 				intersects(base, base + arr->length,
   1140 					arr2->reg, arr2->reg + arr2->length)) {
   1141 				base = MAX2(base, arr2->reg + arr2->length);
   1142 				goto retry;
   1143 			}
   1144 		}
   1145 
   1146 		arr->reg = base;
   1147 
   1148 		for (unsigned i = 0; i < arr->length; i++) {
   1149 			unsigned name, reg;
   1150 
   1151 			name = arr->base + i;
   1152 			reg = ctx->set->gpr_to_ra_reg[0][base++];
   1153 
   1154 			ra_set_node_reg(ctx->g, name, reg);
   1155 		}
   1156 	}
   1157 
   1158 	if (!ra_allocate(ctx->g))
   1159 		return -1;
   1160 
   1161 	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
   1162 		ra_block_alloc(ctx, block);
   1163 	}
   1164 
   1165 	return 0;
   1166 }
   1167 
   1168 int ir3_ra(struct ir3 *ir, enum shader_t type,
   1169 		bool frag_coord, bool frag_face)
   1170 {
   1171 	struct ir3_ra_ctx ctx = {
   1172 			.ir = ir,
   1173 			.type = type,
   1174 			.frag_face = frag_face,
   1175 			.set = ir->compiler->set,
   1176 	};
   1177 	int ret;
   1178 
   1179 	ra_init(&ctx);
   1180 	ra_add_interference(&ctx);
   1181 	ret = ra_alloc(&ctx);
   1182 	ra_destroy(&ctx);
   1183 
   1184 	return ret;
   1185 }
   1186