Home | History | Annotate | Download | only in ir3
      1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
      2 
      3 /*
      4  * Copyright (C) 2014 Rob Clark <robclark (at) freedesktop.org>
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice (including the next
     14  * paragraph) shall be included in all copies or substantial portions of the
     15  * Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     23  * SOFTWARE.
     24  *
     25  * Authors:
     26  *    Rob Clark <robclark (at) freedesktop.org>
     27  */
     28 
     29 #include "freedreno_util.h"
     30 
     31 #include "ir3.h"
     32 #include "ir3_shader.h"
     33 
     34 /*
     35  * Copy Propagate:
     36  */
     37 
/* Pass-local state threaded through the copy-propagation walk. */
struct ir3_cp_ctx {
	struct ir3 *shader;                /* shader being optimized */
	struct ir3_shader_variant *so;     /* variant that receives lowered immediates */
	unsigned immediate_idx;            /* flat count of scalar immediates emitted so far
	                                    * (indexes so->immediates as idx/4 vec4 + idx%4 swiz) */
};
     43 
     44 /* is it a type preserving mov, with ok flags? */
     45 static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
     46 {
     47 	if (is_same_type_mov(instr)) {
     48 		struct ir3_register *dst = instr->regs[0];
     49 		struct ir3_register *src = instr->regs[1];
     50 		struct ir3_instruction *src_instr = ssa(src);
     51 
     52 		/* only if mov src is SSA (not const/immed): */
     53 		if (!src_instr)
     54 			return false;
     55 
     56 		/* no indirect: */
     57 		if (dst->flags & IR3_REG_RELATIV)
     58 			return false;
     59 		if (src->flags & IR3_REG_RELATIV)
     60 			return false;
     61 
     62 		if (!allow_flags)
     63 			if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
     64 					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
     65 				return false;
     66 
     67 		/* TODO: remove this hack: */
     68 		if (src_instr->opc == OPC_META_FO)
     69 			return false;
     70 		/* TODO: we currently don't handle left/right neighbors
     71 		 * very well when inserting parallel-copies into phi..
     72 		 * to avoid problems don't eliminate a mov coming out
     73 		 * of phi..
     74 		 */
     75 		if (src_instr->opc == OPC_META_PHI)
     76 			return false;
     77 		return true;
     78 	}
     79 	return false;
     80 }
     81 
     82 static unsigned cp_flags(unsigned flags)
     83 {
     84 	/* only considering these flags (at least for now): */
     85 	flags &= (IR3_REG_CONST | IR3_REG_IMMED |
     86 			IR3_REG_FNEG | IR3_REG_FABS |
     87 			IR3_REG_SNEG | IR3_REG_SABS |
     88 			IR3_REG_BNOT | IR3_REG_RELATIV);
     89 	return flags;
     90 }
     91 
/**
 * Check whether src arg 'n' of 'instr' could legally carry 'flags'
 * (const/immed/relative addressing plus abs/neg/not modifiers), based
 * on the encoding restrictions of the instruction's category.
 *
 * @param instr  consuming instruction
 * @param n      zero-based src index (the register is instr->regs[n+1])
 * @param flags  candidate IR3_REG_* flags for that src
 * @return true if those flags are encodable on that src
 */
static bool valid_flags(struct ir3_instruction *instr, unsigned n,
		unsigned flags)
{
	unsigned valid_flags;
	flags = cp_flags(flags);

	/* If destination is indirect, then source cannot be.. at least
	 * I don't think so..
	 */
	if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
			(flags & IR3_REG_RELATIV))
		return false;

	/* TODO it seems to *mostly* work to cp RELATIV, except we get some
	 * intermittent piglit variable-indexing fails.  Newer blob driver
	 * doesn't seem to cp these.  Possibly this is hw workaround?  Not
	 * sure, but until that is understood better, lets just switch off
	 * cp for indirect src's:
	 */
	if (flags & IR3_REG_RELATIV)
		return false;

	/* clear flags that are 'ok' */
	switch (opc_cat(instr->opc)) {
	case 1:
		/* mov-type instructions: any addressing mode, no modifiers */
		valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
		if (flags & ~valid_flags)
			return false;
		break;
	case 5:
		/* no flags allowed */
		if (flags)
			return false;
		break;
	case 6:
		valid_flags = IR3_REG_IMMED;
		if (flags & ~valid_flags)
			return false;

		if (flags & IR3_REG_IMMED) {
			/* doesn't seem like we can have immediate src for store
			 * instructions:
			 *
			 * TODO this restriction could also apply to load instructions,
			 * but for load instructions this arg is the address (and not
			 * really sure any good way to test a hard-coded immed addr src)
			 */
			if (is_store(instr) && (n == 1))
				return false;
		}

		break;
	case 2:
		/* absneg support varies per cat2 opcode; immed only for int ops */
		valid_flags = ir3_cat2_absneg(instr->opc) |
				IR3_REG_CONST | IR3_REG_RELATIV;

		if (ir3_cat2_int(instr->opc))
			valid_flags |= IR3_REG_IMMED;

		if (flags & ~valid_flags)
			return false;

		if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
			/* m indexes the *other* src register (n^1 flips 0<->1,
			 * +1 skips the dst at regs[0]):
			 */
			unsigned m = (n ^ 1) + 1;
			/* cannot deal w/ const in both srcs:
			 * (note that some cat2 actually only have a single src)
			 */
			if (m < instr->regs_count) {
				struct ir3_register *reg = instr->regs[m];
				if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
					return false;
				if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
					return false;
			}
			/* cannot be const + ABS|NEG: */
			if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
				return false;
		}
		break;
	case 3:
		valid_flags = ir3_cat3_absneg(instr->opc) |
				IR3_REG_CONST | IR3_REG_RELATIV;

		if (flags & ~valid_flags)
			return false;

		if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) {
			/* cannot deal w/ const/relativ in 2nd src: */
			if (n == 1)
				return false;
		}

		if (flags & IR3_REG_CONST) {
			/* cannot be const + ABS|NEG: */
			if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
				return false;
		}
		break;
	case 4:
		/* seems like blob compiler avoids const as src.. */
		/* TODO double check if this is still the case on a4xx */
		if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
			return false;
		if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
			return false;
		break;
	}

	/* note: other categories (e.g. cat0 flow control, which has no
	 * srcs) fall through here and accept the flags:
	 */
	return true;
}
    204 
/* propagate register flags from src to dst.. negates need special
 * handling to cancel each other out.
 *
 * 'dstflags' is updated in place with the flags of src->regs[1]
 * folded in; 'src' is the mov being collapsed.
 */
static void combine_flags(unsigned *dstflags, struct ir3_instruction *src)
{
	unsigned srcflags = src->regs[1]->flags;

	/* if what we are combining into already has (abs) flags,
	 * we can drop (neg) from src:
	 */
	if (*dstflags & IR3_REG_FABS)
		srcflags &= ~IR3_REG_FNEG;
	if (*dstflags & IR3_REG_SABS)
		srcflags &= ~IR3_REG_SNEG;

	/* abs is sticky (OR), but negate/not toggle (XOR) so that two
	 * negates cancel each other out:
	 */
	if (srcflags & IR3_REG_FABS)
		*dstflags |= IR3_REG_FABS;
	if (srcflags & IR3_REG_SABS)
		*dstflags |= IR3_REG_SABS;
	if (srcflags & IR3_REG_FNEG)
		*dstflags ^= IR3_REG_FNEG;
	if (srcflags & IR3_REG_SNEG)
		*dstflags ^= IR3_REG_SNEG;
	if (srcflags & IR3_REG_BNOT)
		*dstflags ^= IR3_REG_BNOT;

	/* addressing-mode flags come straight from the mov's src: */
	*dstflags &= ~IR3_REG_SSA;
	*dstflags |= srcflags & IR3_REG_SSA;
	*dstflags |= srcflags & IR3_REG_CONST;
	*dstflags |= srcflags & IR3_REG_IMMED;
	*dstflags |= srcflags & IR3_REG_RELATIV;
	*dstflags |= srcflags & IR3_REG_ARRAY;

	/* if src of the src is boolean we can drop the (abs) since we know
	 * the source value is already a postitive integer.  This cleans
	 * up the absnegs that get inserted when converting between nir and
	 * native boolean (see ir3_b2n/n2b)
	 */
	struct ir3_instruction *srcsrc = ssa(src->regs[1]);
	if (srcsrc && is_bool(srcsrc))
		*dstflags &= ~IR3_REG_SABS;
}
    247 
/* Lower an immediate src to a const (uniform) register: fold any
 * abs/neg modifiers into the value, deduplicate against immediates
 * already in the variant's table (appending if not found), and return
 * a cloned register pointing at the resulting const-file slot.
 *
 * NOTE(review): there is no bounds check against the capacity of
 * so->immediates[] here -- assumes it is sized for all lowered
 * immediates; TODO confirm against the array declaration.
 */
static struct ir3_register *
lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags)
{
	unsigned swiz, idx, i;

	reg = ir3_reg_clone(ctx->shader, reg);

	/* in some cases, there are restrictions on (abs)/(neg) plus const..
	 * so just evaluate those and clear the flags:
	 */
	if (new_flags & IR3_REG_SABS) {
		reg->iim_val = abs(reg->iim_val);
		new_flags &= ~IR3_REG_SABS;
	}

	if (new_flags & IR3_REG_FABS) {
		reg->fim_val = fabs(reg->fim_val);
		new_flags &= ~IR3_REG_FABS;
	}

	if (new_flags & IR3_REG_SNEG) {
		reg->iim_val = -reg->iim_val;
		new_flags &= ~IR3_REG_SNEG;
	}

	if (new_flags & IR3_REG_FNEG) {
		reg->fim_val = -reg->fim_val;
		new_flags &= ~IR3_REG_FNEG;
	}

	/* reuse an identical immediate if one was already emitted
	 * (immediates are packed 4 scalars per vec4 const slot):
	 */
	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx  = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) {
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx  = i / 4;
		ctx->so->immediates[idx].val[swiz] = reg->uim_val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	/* rewrite the register as a const-file reference: */
	new_flags &= ~IR3_REG_IMMED;
	new_flags |= IR3_REG_CONST;
	reg->flags = new_flags;
	reg->num = i + (4 * ctx->so->constbase.immediate);

	return reg;
}
    303 
/**
 * Handle cp for a given src register.  This additionally handles
 * the cases of collapsing immedate/const (which replace the src
 * register with a non-ssa src) or collapsing mov's from relative
 * src (which needs to also fixup the address src reference by the
 * instruction).
 *
 * @param ctx    pass-wide state (shader + immediates table)
 * @param instr  instruction consuming the src
 * @param reg    the src register (== instr->regs[n+1])
 * @param n      zero-based src index of 'reg' within 'instr'
 */
static void
reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
		struct ir3_register *reg, unsigned n)
{
	struct ir3_instruction *src = ssa(reg);

	/* don't propagate copies into a PHI, since we don't know if the
	 * src block executed:
	 */
	if (instr->opc == OPC_META_PHI)
		return;

	if (is_eligible_mov(src, true)) {
		/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
		struct ir3_register *src_reg = src->regs[1];
		unsigned new_flags = reg->flags;

		combine_flags(&new_flags, src);

		if (valid_flags(instr, n, new_flags)) {
			if (new_flags & IR3_REG_ARRAY) {
				debug_assert(!(reg->flags & IR3_REG_ARRAY));
				reg->array = src_reg->array;
			}
			reg->flags = new_flags;
			reg->instr = ssa(src_reg);
		}

		src = ssa(reg);      /* could be null for IR3_REG_ARRAY case */
		if (!src)
			return;
	} else if (is_same_type_mov(src) &&
			/* cannot collapse const/immed/etc into meta instrs: */
			!is_meta(instr)) {
		/* immed/const/etc cases, which require some special handling: */
		struct ir3_register *src_reg = src->regs[1];
		unsigned new_flags = reg->flags;

		combine_flags(&new_flags, src);

		if (!valid_flags(instr, n, new_flags)) {
			/* See if lowering an immediate to const would help. */
			if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
				debug_assert(new_flags & IR3_REG_IMMED);
				instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
				return;
			}

			/* special case for "normal" mad instructions, we can
			 * try swapping the first two args if that fits better.
			 *
			 * the "plain" MAD's (ie. the ones that don't shift first
			 * src prior to multiply) can swap their first two srcs if
			 * src[0] is !CONST and src[1] is CONST:
			 */
			if ((n == 1) && is_mad(instr->opc) &&
					!(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
					valid_flags(instr, 0, new_flags)) {
				/* swap src[0] and src[1]: */
				struct ir3_register *tmp;
				tmp = instr->regs[0 + 1];
				instr->regs[0 + 1] = instr->regs[1 + 1];
				instr->regs[1 + 1] = tmp;
				/* note: from here on we operate on src slot 0: */
				n = 0;
			} else {
				return;
			}
		}

		/* Here we handle the special case of mov from
		 * CONST and/or RELATIV.  These need to be handled
		 * specially, because in the case of move from CONST
		 * there is no src ir3_instruction so we need to
		 * replace the ir3_register.  And in the case of
		 * RELATIV we need to handle the address register
		 * dependency.
		 */
		if (src_reg->flags & IR3_REG_CONST) {
			/* an instruction cannot reference two different
			 * address registers:
			 */
			if ((src_reg->flags & IR3_REG_RELATIV) &&
					conflicts(instr->address, reg->instr->address))
				return;

			/* This seems to be a hw bug, or something where the timings
			 * just somehow don't work out.  This restriction may only
			 * apply if the first src is also CONST.
			 */
			if ((opc_cat(instr->opc) == 3) && (n == 2) &&
					(src_reg->flags & IR3_REG_RELATIV) &&
					(src_reg->array.offset == 0))
				return;

			/* clone before mutating -- src_reg may be shared: */
			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
			src_reg->flags = new_flags;
			instr->regs[n+1] = src_reg;

			if (src_reg->flags & IR3_REG_RELATIV)
				ir3_instr_set_address(instr, reg->instr->address);

			return;
		}

		if ((src_reg->flags & IR3_REG_RELATIV) &&
				!conflicts(instr->address, reg->instr->address)) {
			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
			src_reg->flags = new_flags;
			instr->regs[n+1] = src_reg;
			ir3_instr_set_address(instr, reg->instr->address);

			return;
		}

		/* NOTE: seems we can only do immed integers, so don't
		 * need to care about float.  But we do need to handle
		 * abs/neg *before* checking that the immediate requires
		 * few enough bits to encode:
		 *
		 * TODO: do we need to do something to avoid accidentally
		 * catching a float immed?
		 */
		if (src_reg->flags & IR3_REG_IMMED) {
			int32_t iim_val = src_reg->iim_val;

			debug_assert((opc_cat(instr->opc) == 1) ||
					(opc_cat(instr->opc) == 6) ||
					ir3_cat2_int(instr->opc));

			/* fold the modifiers directly into the immediate value: */
			if (new_flags & IR3_REG_SABS)
				iim_val = abs(iim_val);

			if (new_flags & IR3_REG_SNEG)
				iim_val = -iim_val;

			if (new_flags & IR3_REG_BNOT)
				iim_val = ~iim_val;

			/* other than category 1 (mov) we can only encode up to 10 bits: */
			if ((instr->opc == OPC_MOV) ||
					!((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
				new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
				src_reg = ir3_reg_clone(instr->block->shader, src_reg);
				src_reg->flags = new_flags;
				src_reg->iim_val = iim_val;
				instr->regs[n+1] = src_reg;
			} else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
				/* See if lowering an immediate to const would help. */
				instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
			}

			return;
		}
	}
}
    466 
    467 /* Handle special case of eliminating output mov, and similar cases where
    468  * there isn't a normal "consuming" instruction.  In this case we cannot
    469  * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
    470  * be eliminated)
    471  */
    472 static struct ir3_instruction *
    473 eliminate_output_mov(struct ir3_instruction *instr)
    474 {
    475 	if (is_eligible_mov(instr, false)) {
    476 		struct ir3_register *reg = instr->regs[1];
    477 		if (!(reg->flags & IR3_REG_ARRAY)) {
    478 			struct ir3_instruction *src_instr = ssa(reg);
    479 			debug_assert(src_instr);
    480 			return src_instr;
    481 		}
    482 	}
    483 	return instr;
    484 }
    485 
/**
 * Find instruction src's which are mov's that can be collapsed, replacing
 * the mov dst with the mov src.
 *
 * Recursive depth-first walk from 'instr' through its srcs; uses the
 * instruction mark to visit each instruction only once.
 */
static void
instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_register *reg;

	if (instr->regs_count == 0)
		return;

	if (ir3_instr_check_mark(instr))
		return;

	/* walk down the graph from each src: */
	foreach_src_n(reg, n, instr) {
		struct ir3_instruction *src = ssa(reg);

		if (!src)
			continue;

		/* process the src subtree before trying to collapse it: */
		instr_cp(ctx, src);

		/* TODO non-indirect access we could figure out which register
		 * we actually want and allow cp..
		 */
		if (reg->flags & IR3_REG_ARRAY)
			continue;

		reg_cp(ctx, instr, reg, n);
	}

	/* for array writes, regs[0] also carries an ssa dependency: */
	if (instr->regs[0]->flags & IR3_REG_ARRAY) {
		struct ir3_instruction *src = ssa(instr->regs[0]);
		if (src)
			instr_cp(ctx, src);
	}

	if (instr->address) {
		instr_cp(ctx, instr->address);
		ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
	}

	/* we can end up with extra cmps.s from frontend, which uses a
	 *
	 *    cmps.s p0.x, cond, 0
	 *
	 * as a way to mov into the predicate register.  But frequently 'cond'
	 * is itself a cmps.s/cmps.f/cmps.u.  So detect this special case and
	 * just re-write the instruction writing predicate register to get rid
	 * of the double cmps.
	 */
	if ((instr->opc == OPC_CMPS_S) &&
			(instr->regs[0]->num == regid(REG_P0, 0)) &&
			ssa(instr->regs[1]) &&
			(instr->regs[2]->flags & IR3_REG_IMMED) &&
			(instr->regs[2]->iim_val == 0)) {
		struct ir3_instruction *cond = ssa(instr->regs[1]);
		switch (cond->opc) {
		case OPC_CMPS_S:
		case OPC_CMPS_F:
		case OPC_CMPS_U:
			/* steal the inner compare's opcode and srcs: */
			instr->opc   = cond->opc;
			instr->flags = cond->flags;
			instr->cat2  = cond->cat2;
			instr->address = cond->address;
			instr->regs[1] = cond->regs[1];
			instr->regs[2] = cond->regs[2];
			break;
		default:
			break;
		}
	}
}
    561 
    562 void
    563 ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
    564 {
    565 	struct ir3_cp_ctx ctx = {
    566 			.shader = ir,
    567 			.so = so,
    568 	};
    569 
    570 	ir3_clear_mark(ir);
    571 
    572 	for (unsigned i = 0; i < ir->noutputs; i++) {
    573 		if (ir->outputs[i]) {
    574 			instr_cp(&ctx, ir->outputs[i]);
    575 			ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
    576 		}
    577 	}
    578 
    579 	for (unsigned i = 0; i < ir->keeps_count; i++) {
    580 		instr_cp(&ctx, ir->keeps[i]);
    581 		ir->keeps[i] = eliminate_output_mov(ir->keeps[i]);
    582 	}
    583 
    584 	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
    585 		if (block->condition) {
    586 			instr_cp(&ctx, block->condition);
    587 			block->condition = eliminate_output_mov(block->condition);
    588 		}
    589 	}
    590 }
    591