Home | History | Annotate | Download | only in ir3
      1 /*
      2  * Copyright (c) 2013 Rob Clark <robdclark (at) gmail.com>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     21  * SOFTWARE.
     22  */
     23 
     24 #ifndef IR3_H_
     25 #define IR3_H_
     26 
     27 #include <stdint.h>
     28 #include <stdbool.h>
     29 
     30 #include "util/u_debug.h"
     31 #include "util/list.h"
     32 
     33 #include "instr-a3xx.h"
     34 #include "disasm.h"  /* TODO move 'enum shader_t' somewhere else.. */
     35 
     36 /* low level intermediate representation of an adreno shader program */
     37 
     38 struct ir3_compiler;
     39 struct ir3;
     40 struct ir3_instruction;
     41 struct ir3_block;
     42 
     43 struct ir3_info {
     44 	uint32_t gpu_id;
     45 	uint16_t sizedwords;
     46 	uint16_t instrs_count;   /* expanded to account for rpt's */
     47 	/* NOTE: max_reg, etc, does not include registers not touched
     48 	 * by the shader (ie. vertex fetched via VFD_DECODE but not
     49 	 * touched by shader)
     50 	 */
     51 	int8_t   max_reg;   /* highest GPR # used by shader */
     52 	int8_t   max_half_reg;
     53 	int16_t  max_const;
     54 };
     55 
     56 struct ir3_register {
     57 	enum {
     58 		IR3_REG_CONST  = 0x001,
     59 		IR3_REG_IMMED  = 0x002,
     60 		IR3_REG_HALF   = 0x004,
     61 		/* high registers are used for some things in compute shaders,
     62 		 * for example.  Seems to be for things that are global to all
     63 		 * threads in a wave, so possibly these are global/shared by
     64 		 * all the threads in the wave?
     65 		 */
     66 		IR3_REG_HIGH   = 0x008,
     67 		IR3_REG_RELATIV= 0x010,
     68 		IR3_REG_R      = 0x020,
     69 		/* Most instructions, it seems, can do float abs/neg but not
     70 		 * integer.  The CP pass needs to know what is intended (int or
     71 		 * float) in order to do the right thing.  For this reason the
     72 		 * abs/neg flags are split out into float and int variants.  In
     73 		 * addition, .b (bitwise) operations, the negate is actually a
     74 		 * bitwise not, so split that out into a new flag to make it
     75 		 * more clear.
     76 		 */
     77 		IR3_REG_FNEG   = 0x040,
     78 		IR3_REG_FABS   = 0x080,
     79 		IR3_REG_SNEG   = 0x100,
     80 		IR3_REG_SABS   = 0x200,
     81 		IR3_REG_BNOT   = 0x400,
     82 		IR3_REG_EVEN   = 0x800,
     83 		IR3_REG_POS_INF= 0x1000,
     84 		/* (ei) flag, end-input?  Set on last bary, presumably to signal
     85 		 * that the shader needs no more input:
     86 		 */
     87 		IR3_REG_EI     = 0x2000,
     88 		/* meta-flags, for intermediate stages of IR, ie.
     89 		 * before register assignment is done:
     90 		 */
     91 		IR3_REG_SSA    = 0x4000,   /* 'instr' is ptr to assigning instr */
     92 		IR3_REG_ARRAY  = 0x8000,
     93 		IR3_REG_PHI_SRC= 0x10000,  /* phi src, regs[0]->instr points to phi */
     94 
     95 	} flags;
     96 	union {
     97 		/* normal registers:
     98 		 * the component is in the low two bits of the reg #, so
     99 		 * rN.x becomes: (N << 2) | x
    100 		 */
    101 		int   num;
    102 		/* immediate: */
    103 		int32_t  iim_val;
    104 		uint32_t uim_val;
    105 		float    fim_val;
    106 		/* relative: */
    107 		struct {
    108 			uint16_t id;
    109 			int16_t offset;
    110 		} array;
    111 	};
    112 
    113 	/* For IR3_REG_SSA, src registers contain ptr back to assigning
    114 	 * instruction.
    115 	 *
    116 	 * For IR3_REG_ARRAY, the pointer is back to the last dependent
    117 	 * array access (although the net effect is the same, it points
    118 	 * back to a previous instruction that we depend on).
    119 	 */
    120 	struct ir3_instruction *instr;
    121 
    122 	union {
    123 		/* used for cat5 instructions, but also for internal/IR level
    124 		 * tracking of what registers are read/written by an instruction.
    125 		 * wrmask may be a bad name since it is used to represent both
    126 		 * src and dst that touch multiple adjacent registers.
    127 		 */
    128 		unsigned wrmask;
    129 		/* for relative addressing, 32bits for array size is too small,
    130 		 * but otoh we don't need to deal with disjoint sets, so instead
    131 		 * use a simple size field (number of scalar components).
    132 		 */
    133 		unsigned size;
    134 	};
    135 };
    136 
    137 /*
    138  * Stupid/simple growable array implementation:
    139  */
    140 #define DECLARE_ARRAY(type, name) \
    141 	unsigned name ## _count, name ## _sz; \
    142 	type * name;
    143 
    144 #define array_insert(ctx, arr, val) do { \
    145 		if (arr ## _count == arr ## _sz) { \
    146 			arr ## _sz = MAX2(2 * arr ## _sz, 16); \
    147 			arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \
    148 		} \
    149 		arr[arr ##_count++] = val; \
    150 	} while (0)
    151 
    152 struct ir3_instruction {
    153 	struct ir3_block *block;
    154 	opc_t opc;
    155 	enum {
    156 		/* (sy) flag is set on first instruction, and after sample
    157 		 * instructions (probably just on RAW hazard).
    158 		 */
    159 		IR3_INSTR_SY    = 0x001,
    160 		/* (ss) flag is set on first instruction, and first instruction
    161 		 * to depend on the result of "long" instructions (RAW hazard):
    162 		 *
    163 		 *   rcp, rsq, log2, exp2, sin, cos, sqrt
    164 		 *
    165 		 * It seems to synchronize until all in-flight instructions are
    166 		 * completed, for example:
    167 		 *
    168 		 *   rsq hr1.w, hr1.w
    169 		 *   add.f hr2.z, (neg)hr2.z, hc0.y
    170 		 *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
    171 		 *   rsq hr2.x, hr2.x
    172 		 *   (rpt1)nop
    173 		 *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
    174 		 *   nop
    175 		 *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
    176 		 *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
    177 		 *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
    178 		 *
    179 		 * The last mul.f does not have (ss) set, presumably because the
    180 		 * (ss) on the previous instruction does the job.
    181 		 *
    182 		 * The blob driver also seems to set it on WAR hazards, although
    183 		 * not really clear if this is needed or just blob compiler being
    184 		 * sloppy.  So far I haven't found a case where removing the (ss)
    185 		 * causes problems for WAR hazard, but I could just be getting
    186 		 * lucky:
    187 		 *
    188 		 *   rcp r1.y, r3.y
    189 		 *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
    190 		 *
    191 		 */
    192 		IR3_INSTR_SS    = 0x002,
    193 		/* (jp) flag is set on jump targets:
    194 		 */
    195 		IR3_INSTR_JP    = 0x004,
    196 		IR3_INSTR_UL    = 0x008,
    197 		IR3_INSTR_3D    = 0x010,
    198 		IR3_INSTR_A     = 0x020,
    199 		IR3_INSTR_O     = 0x040,
    200 		IR3_INSTR_P     = 0x080,
    201 		IR3_INSTR_S     = 0x100,
    202 		IR3_INSTR_S2EN  = 0x200,
    203 		IR3_INSTR_G     = 0x400,
    204 		/* meta-flags, for intermediate stages of IR, ie.
    205 		 * before register assignment is done:
    206 		 */
    207 		IR3_INSTR_MARK  = 0x1000,
    208 		IR3_INSTR_UNUSED= 0x2000,
    209 	} flags;
    210 	int repeat;
    211 #ifdef DEBUG
    212 	unsigned regs_max;
    213 #endif
    214 	unsigned regs_count;
    215 	struct ir3_register **regs;
    216 	union {
    217 		struct {
    218 			char inv;
    219 			char comp;
    220 			int  immed;
    221 			struct ir3_block *target;
    222 		} cat0;
    223 		struct {
    224 			type_t src_type, dst_type;
    225 		} cat1;
    226 		struct {
    227 			enum {
    228 				IR3_COND_LT = 0,
    229 				IR3_COND_LE = 1,
    230 				IR3_COND_GT = 2,
    231 				IR3_COND_GE = 3,
    232 				IR3_COND_EQ = 4,
    233 				IR3_COND_NE = 5,
    234 			} condition;
    235 		} cat2;
    236 		struct {
    237 			unsigned samp, tex;
    238 			type_t type;
    239 		} cat5;
    240 		struct {
    241 			type_t type;
    242 			int src_offset;
    243 			int dst_offset;
    244 			int iim_val : 3;      /* for ldgb/stgb, # of components */
    245 			int d : 3;
    246 			bool typed : 1;
    247 		} cat6;
    248 		struct {
    249 			unsigned w : 1;       /* write */
    250 			unsigned r : 1;       /* read */
    251 			unsigned l : 1;       /* local */
    252 			unsigned g : 1;       /* global */
    253 		} cat7;
    254 		/* for meta-instructions, just used to hold extra data
    255 		 * before instruction scheduling, etc
    256 		 */
    257 		struct {
    258 			int off;              /* component/offset */
    259 		} fo;
    260 		struct {
    261 			/* used to temporarily hold reference to nir_phi_instr
    262 			 * until we resolve the phi srcs
    263 			 */
    264 			void *nphi;
    265 		} phi;
    266 		struct {
    267 			struct ir3_block *block;
    268 		} inout;
    269 	};
    270 
    271 	/* transient values used during various algorithms: */
    272 	union {
    273 		/* The instruction depth is the max dependency distance to output.
    274 		 *
    275 		 * You can also think of it as the "cost", if we did any sort of
    276 		 * optimization for register footprint.  Ie. a value that is  just
    277 		 * result of moving a const to a reg would have a low cost,  so to
    278 		 * it could make sense to duplicate the instruction at various
    279 		 * points where the result is needed to reduce register footprint.
    280 		 */
    281 		unsigned depth;
    282 		/* When we get to the RA stage, we no longer need depth, but
    283 		 * we do need instruction's position/name:
    284 		 */
    285 		struct {
    286 			uint16_t ip;
    287 			uint16_t name;
    288 		};
    289 	};
    290 
    291 	/* used for per-pass extra instruction data.
    292 	 */
    293 	void *data;
    294 
    295 	/* Used during CP and RA stages.  For fanin and shader inputs/
    296 	 * outputs where we need a sequence of consecutive registers,
    297 	 * keep track of each src instructions left (ie 'n-1') and right
    298 	 * (ie 'n+1') neighbor.  The front-end must insert enough mov's
    299 	 * to ensure that each instruction has at most one left and at
    300 	 * most one right neighbor.  During the copy-propagation pass,
    301 	 * we only remove mov's when we can preserve this constraint.
    302 	 * And during the RA stage, we use the neighbor information to
    303 	 * allocate a block of registers in one shot.
    304 	 *
    305 	 * TODO: maybe just add something like:
    306 	 *   struct ir3_instruction_ref {
    307 	 *       struct ir3_instruction *instr;
    308 	 *       unsigned cnt;
    309 	 *   }
    310 	 *
    311 	 * Or can we get away without the refcnt stuff?  It seems like
    312 	 * it should be overkill..  the problem is if, potentially after
    313 	 * already eliminating some mov's, if you have a single mov that
    314 	 * needs to be grouped with it's neighbors in two different
    315 	 * places (ex. shader output and a fanin).
    316 	 */
    317 	struct {
    318 		struct ir3_instruction *left, *right;
    319 		uint16_t left_cnt, right_cnt;
    320 	} cp;
    321 
    322 	/* an instruction can reference at most one address register amongst
    323 	 * it's src/dst registers.  Beyond that, you need to insert mov's.
    324 	 *
    325 	 * NOTE: do not write this directly, use ir3_instr_set_address()
    326 	 */
    327 	struct ir3_instruction *address;
    328 
    329 	/* Tracking for additional dependent instructions.  Used to handle
    330 	 * barriers, WAR hazards for arrays/SSBOs/etc.
    331 	 */
    332 	DECLARE_ARRAY(struct ir3_instruction *, deps);
    333 
    334 	/*
    335 	 * From PoV of instruction scheduling, not execution (ie. ignores global/
    336 	 * local distinction):
    337 	 *                            shared  image  atomic  SSBO  everything
    338 	 *   barrier()/            -   R/W     R/W    R/W     R/W       X
    339 	 *     groupMemoryBarrier()
    340 	 *   memoryBarrier()       -           R/W    R/W
    341 	 *     (but only images declared coherent?)
    342 	 *   memoryBarrierAtomic() -                  R/W
    343 	 *   memoryBarrierBuffer() -                          R/W
    344 	 *   memoryBarrierImage()  -           R/W
    345 	 *   memoryBarrierShared() -   R/W
    346 	 *
    347 	 * TODO I think for SSBO/image/shared, in cases where we can determine
    348 	 * which variable is accessed, we don't need to care about accesses to
    349 	 * different variables (unless declared coherent??)
    350 	 */
    351 	enum {
    352 		IR3_BARRIER_EVERYTHING = 1 << 0,
    353 		IR3_BARRIER_SHARED_R   = 1 << 1,
    354 		IR3_BARRIER_SHARED_W   = 1 << 2,
    355 		IR3_BARRIER_IMAGE_R    = 1 << 3,
    356 		IR3_BARRIER_IMAGE_W    = 1 << 4,
    357 		IR3_BARRIER_BUFFER_R   = 1 << 5,
    358 		IR3_BARRIER_BUFFER_W   = 1 << 6,
    359 		IR3_BARRIER_ARRAY_R    = 1 << 7,
    360 		IR3_BARRIER_ARRAY_W    = 1 << 8,
    361 	} barrier_class, barrier_conflict;
    362 
    363 	/* Entry in ir3_block's instruction list: */
    364 	struct list_head node;
    365 
    366 #ifdef DEBUG
    367 	uint32_t serialno;
    368 #endif
    369 };
    370 
    371 static inline struct ir3_instruction *
    372 ir3_neighbor_first(struct ir3_instruction *instr)
    373 {
    374 	int cnt = 0;
    375 	while (instr->cp.left) {
    376 		instr = instr->cp.left;
    377 		if (++cnt > 0xffff) {
    378 			debug_assert(0);
    379 			break;
    380 		}
    381 	}
    382 	return instr;
    383 }
    384 
    385 static inline int ir3_neighbor_count(struct ir3_instruction *instr)
    386 {
    387 	int num = 1;
    388 
    389 	debug_assert(!instr->cp.left);
    390 
    391 	while (instr->cp.right) {
    392 		num++;
    393 		instr = instr->cp.right;
    394 		if (num > 0xffff) {
    395 			debug_assert(0);
    396 			break;
    397 		}
    398 	}
    399 
    400 	return num;
    401 }
    402 
    403 struct ir3 {
    404 	struct ir3_compiler *compiler;
    405 
    406 	unsigned ninputs, noutputs;
    407 	struct ir3_instruction **inputs;
    408 	struct ir3_instruction **outputs;
    409 
    410 	/* Track bary.f (and ldlv) instructions.. this is needed in
    411 	 * scheduling to ensure that all varying fetches happen before
    412 	 * any potential kill instructions.  The hw gets grumpy if all
    413 	 * threads in a group are killed before the last bary.f gets
    414 	 * a chance to signal end of input (ei).
    415 	 */
    416 	DECLARE_ARRAY(struct ir3_instruction *, baryfs);
    417 
    418 	/* Track all indirect instructions (read and write).  To avoid
    419 	 * deadlock scenario where an address register gets scheduled,
    420 	 * but other dependent src instructions cannot be scheduled due
    421 	 * to dependency on a *different* address register value, the
    422 	 * scheduler needs to ensure that all dependencies other than
    423 	 * the instruction other than the address register are scheduled
    424 	 * before the one that writes the address register.  Having a
    425 	 * convenient list of instructions that reference some address
    426 	 * register simplifies this.
    427 	 */
    428 	DECLARE_ARRAY(struct ir3_instruction *, indirects);
    429 
    430 	/* and same for instructions that consume predicate register: */
    431 	DECLARE_ARRAY(struct ir3_instruction *, predicates);
    432 
    433 	/* Track texture sample instructions which need texture state
    434 	 * patched in (for astc-srgb workaround):
    435 	 */
    436 	DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);
    437 
    438 	/* List of blocks: */
    439 	struct list_head block_list;
    440 
    441 	/* List of ir3_array's: */
    442 	struct list_head array_list;
    443 };
    444 
    445 typedef struct nir_register nir_register;
    446 
    447 struct ir3_array {
    448 	struct list_head node;
    449 	unsigned length;
    450 	unsigned id;
    451 
    452 	nir_register *r;
    453 
    454 	/* To avoid array write's from getting DCE'd, keep track of the
    455 	 * most recent write.  Any array access depends on the most
    456 	 * recent write.  This way, nothing depends on writes after the
    457 	 * last read.  But all the writes that happen before that have
    458 	 * something depending on them
    459 	 */
    460 	struct ir3_instruction *last_write;
    461 
    462 	/* extra stuff used in RA pass: */
    463 	unsigned base;      /* base vreg name */
    464 	unsigned reg;       /* base physical reg */
    465 	uint16_t start_ip, end_ip;
    466 };
    467 
    468 struct ir3_array * ir3_lookup_array(struct ir3 *ir, unsigned id);
    469 
    470 typedef struct nir_block nir_block;
    471 
    472 struct ir3_block {
    473 	struct list_head node;
    474 	struct ir3 *shader;
    475 
    476 	nir_block *nblock;
    477 
    478 	struct list_head instr_list;  /* list of ir3_instruction */
    479 
    480 	/* each block has either one or two successors.. in case of
    481 	 * two successors, 'condition' decides which one to follow.
    482 	 * A block preceding an if/else has two successors.
    483 	 */
    484 	struct ir3_instruction *condition;
    485 	struct ir3_block *successors[2];
    486 
    487 	uint16_t start_ip, end_ip;
    488 
    489 	/* Track instructions which do not write a register but other-
    490 	 * wise must not be discarded (such as kill, stg, etc)
    491 	 */
    492 	DECLARE_ARRAY(struct ir3_instruction *, keeps);
    493 
    494 	/* used for per-pass extra block data.  Mainly used right
    495 	 * now in RA step to track livein/liveout.
    496 	 */
    497 	void *data;
    498 
    499 #ifdef DEBUG
    500 	uint32_t serialno;
    501 #endif
    502 };
    503 
    504 static inline uint32_t
    505 block_id(struct ir3_block *block)
    506 {
    507 #ifdef DEBUG
    508 	return block->serialno;
    509 #else
    510 	return (uint32_t)(unsigned long)block;
    511 #endif
    512 }
    513 
    514 struct ir3 * ir3_create(struct ir3_compiler *compiler,
    515 		unsigned nin, unsigned nout);
    516 void ir3_destroy(struct ir3 *shader);
    517 void * ir3_assemble(struct ir3 *shader,
    518 		struct ir3_info *info, uint32_t gpu_id);
    519 void * ir3_alloc(struct ir3 *shader, int sz);
    520 
    521 struct ir3_block * ir3_block_create(struct ir3 *shader);
    522 
    523 struct ir3_instruction * ir3_instr_create(struct ir3_block *block, opc_t opc);
    524 struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
    525 		opc_t opc, int nreg);
    526 struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr);
    527 void ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep);
    528 const char *ir3_instr_name(struct ir3_instruction *instr);
    529 
    530 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
    531 		int num, int flags);
    532 struct ir3_register * ir3_reg_clone(struct ir3 *shader,
    533 		struct ir3_register *reg);
    534 
    535 void ir3_instr_set_address(struct ir3_instruction *instr,
    536 		struct ir3_instruction *addr);
    537 
    538 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
    539 {
    540 	if (instr->flags & IR3_INSTR_MARK)
    541 		return true;  /* already visited */
    542 	instr->flags |= IR3_INSTR_MARK;
    543 	return false;
    544 }
    545 
    546 void ir3_block_clear_mark(struct ir3_block *block);
    547 void ir3_clear_mark(struct ir3 *shader);
    548 
    549 unsigned ir3_count_instructions(struct ir3 *ir);
    550 
    551 static inline int ir3_instr_regno(struct ir3_instruction *instr,
    552 		struct ir3_register *reg)
    553 {
    554 	unsigned i;
    555 	for (i = 0; i < instr->regs_count; i++)
    556 		if (reg == instr->regs[i])
    557 			return i;
    558 	return -1;
    559 }
    560 
    561 
    562 #define MAX_ARRAYS 16
    563 
    564 /* comp:
    565  *   0 - x
    566  *   1 - y
    567  *   2 - z
    568  *   3 - w
    569  */
    570 static inline uint32_t regid(int num, int comp)
    571 {
    572 	return (num << 2) | (comp & 0x3);
    573 }
    574 
    575 static inline uint32_t reg_num(struct ir3_register *reg)
    576 {
    577 	return reg->num >> 2;
    578 }
    579 
    580 static inline uint32_t reg_comp(struct ir3_register *reg)
    581 {
    582 	return reg->num & 0x3;
    583 }
    584 
    585 static inline bool is_flow(struct ir3_instruction *instr)
    586 {
    587 	return (opc_cat(instr->opc) == 0);
    588 }
    589 
    590 static inline bool is_kill(struct ir3_instruction *instr)
    591 {
    592 	return instr->opc == OPC_KILL;
    593 }
    594 
    595 static inline bool is_nop(struct ir3_instruction *instr)
    596 {
    597 	return instr->opc == OPC_NOP;
    598 }
    599 
    600 /* Is it a non-transformative (ie. not type changing) mov?  This can
    601  * also include absneg.s/absneg.f, which for the most part can be
    602  * treated as a mov (single src argument).
    603  */
    604 static inline bool is_same_type_mov(struct ir3_instruction *instr)
    605 {
    606 	struct ir3_register *dst;
    607 
    608 	switch (instr->opc) {
    609 	case OPC_MOV:
    610 		if (instr->cat1.src_type != instr->cat1.dst_type)
    611 			return false;
    612 		break;
    613 	case OPC_ABSNEG_F:
    614 	case OPC_ABSNEG_S:
    615 		break;
    616 	default:
    617 		return false;
    618 	}
    619 
    620 	dst = instr->regs[0];
    621 
    622 	/* mov's that write to a0.x or p0.x are special: */
    623 	if (dst->num == regid(REG_P0, 0))
    624 		return false;
    625 	if (dst->num == regid(REG_A0, 0))
    626 		return false;
    627 
    628 	if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY))
    629 		return false;
    630 
    631 	return true;
    632 }
    633 
    634 static inline bool is_alu(struct ir3_instruction *instr)
    635 {
    636 	return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3);
    637 }
    638 
    639 static inline bool is_sfu(struct ir3_instruction *instr)
    640 {
    641 	return (opc_cat(instr->opc) == 4);
    642 }
    643 
    644 static inline bool is_tex(struct ir3_instruction *instr)
    645 {
    646 	return (opc_cat(instr->opc) == 5);
    647 }
    648 
    649 static inline bool is_mem(struct ir3_instruction *instr)
    650 {
    651 	return (opc_cat(instr->opc) == 6);
    652 }
    653 
    654 static inline bool is_barrier(struct ir3_instruction *instr)
    655 {
    656 	return (opc_cat(instr->opc) == 7);
    657 }
    658 
    659 static inline bool
    660 is_store(struct ir3_instruction *instr)
    661 {
    662 	/* these instructions, the "destination" register is
    663 	 * actually a source, the address to store to.
    664 	 */
    665 	switch (instr->opc) {
    666 	case OPC_STG:
    667 	case OPC_STGB:
    668 	case OPC_STIB:
    669 	case OPC_STP:
    670 	case OPC_STL:
    671 	case OPC_STLW:
    672 	case OPC_L2G:
    673 	case OPC_G2L:
    674 		return true;
    675 	default:
    676 		return false;
    677 	}
    678 }
    679 
    680 static inline bool is_load(struct ir3_instruction *instr)
    681 {
    682 	switch (instr->opc) {
    683 	case OPC_LDG:
    684 	case OPC_LDGB:
    685 	case OPC_LDL:
    686 	case OPC_LDP:
    687 	case OPC_L2G:
    688 	case OPC_LDLW:
    689 	case OPC_LDC:
    690 	case OPC_LDLV:
    691 		/* probably some others too.. */
    692 		return true;
    693 	default:
    694 		return false;
    695 	}
    696 }
    697 
    698 static inline bool is_input(struct ir3_instruction *instr)
    699 {
    700 	/* in some cases, ldlv is used to fetch varying without
    701 	 * interpolation.. fortunately inloc is the first src
    702 	 * register in either case
    703 	 */
    704 	switch (instr->opc) {
    705 	case OPC_LDLV:
    706 	case OPC_BARY_F:
    707 		return true;
    708 	default:
    709 		return false;
    710 	}
    711 }
    712 
    713 static inline bool is_bool(struct ir3_instruction *instr)
    714 {
    715 	switch (instr->opc) {
    716 	case OPC_CMPS_F:
    717 	case OPC_CMPS_S:
    718 	case OPC_CMPS_U:
    719 		return true;
    720 	default:
    721 		return false;
    722 	}
    723 }
    724 
    725 static inline bool is_meta(struct ir3_instruction *instr)
    726 {
    727 	/* TODO how should we count PHI (and maybe fan-in/out) which
    728 	 * might actually contribute some instructions to the final
    729 	 * result?
    730 	 */
    731 	return (opc_cat(instr->opc) == -1);
    732 }
    733 
    734 static inline bool writes_addr(struct ir3_instruction *instr)
    735 {
    736 	if (instr->regs_count > 0) {
    737 		struct ir3_register *dst = instr->regs[0];
    738 		return reg_num(dst) == REG_A0;
    739 	}
    740 	return false;
    741 }
    742 
    743 static inline bool writes_pred(struct ir3_instruction *instr)
    744 {
    745 	if (instr->regs_count > 0) {
    746 		struct ir3_register *dst = instr->regs[0];
    747 		return reg_num(dst) == REG_P0;
    748 	}
    749 	return false;
    750 }
    751 
    752 /* returns defining instruction for reg */
    753 /* TODO better name */
    754 static inline struct ir3_instruction *ssa(struct ir3_register *reg)
    755 {
    756 	if (reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) {
    757 		debug_assert(!(reg->instr && (reg->instr->flags & IR3_INSTR_UNUSED)));
    758 		return reg->instr;
    759 	}
    760 	return NULL;
    761 }
    762 
    763 static inline bool conflicts(struct ir3_instruction *a,
    764 		struct ir3_instruction *b)
    765 {
    766 	return (a && b) && (a != b);
    767 }
    768 
    769 static inline bool reg_gpr(struct ir3_register *r)
    770 {
    771 	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
    772 		return false;
    773 	if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
    774 		return false;
    775 	return true;
    776 }
    777 
    778 static inline type_t half_type(type_t type)
    779 {
    780 	switch (type) {
    781 	case TYPE_F32: return TYPE_F16;
    782 	case TYPE_U32: return TYPE_U16;
    783 	case TYPE_S32: return TYPE_S16;
    784 	case TYPE_F16:
    785 	case TYPE_U16:
    786 	case TYPE_S16:
    787 		return type;
    788 	default:
    789 		assert(0);
    790 		return ~0;
    791 	}
    792 }
    793 
    794 /* some cat2 instructions (ie. those which are not float) can embed an
    795  * immediate:
    796  */
    797 static inline bool ir3_cat2_int(opc_t opc)
    798 {
    799 	switch (opc) {
    800 	case OPC_ADD_U:
    801 	case OPC_ADD_S:
    802 	case OPC_SUB_U:
    803 	case OPC_SUB_S:
    804 	case OPC_CMPS_U:
    805 	case OPC_CMPS_S:
    806 	case OPC_MIN_U:
    807 	case OPC_MIN_S:
    808 	case OPC_MAX_U:
    809 	case OPC_MAX_S:
    810 	case OPC_CMPV_U:
    811 	case OPC_CMPV_S:
    812 	case OPC_MUL_U:
    813 	case OPC_MUL_S:
    814 	case OPC_MULL_U:
    815 	case OPC_CLZ_S:
    816 	case OPC_ABSNEG_S:
    817 	case OPC_AND_B:
    818 	case OPC_OR_B:
    819 	case OPC_NOT_B:
    820 	case OPC_XOR_B:
    821 	case OPC_BFREV_B:
    822 	case OPC_CLZ_B:
    823 	case OPC_SHL_B:
    824 	case OPC_SHR_B:
    825 	case OPC_ASHR_B:
    826 	case OPC_MGEN_B:
    827 	case OPC_GETBIT_B:
    828 	case OPC_CBITS_B:
    829 	case OPC_BARY_F:
    830 		return true;
    831 
    832 	default:
    833 		return false;
    834 	}
    835 }
    836 
    837 
    838 /* map cat2 instruction to valid abs/neg flags: */
    839 static inline unsigned ir3_cat2_absneg(opc_t opc)
    840 {
    841 	switch (opc) {
    842 	case OPC_ADD_F:
    843 	case OPC_MIN_F:
    844 	case OPC_MAX_F:
    845 	case OPC_MUL_F:
    846 	case OPC_SIGN_F:
    847 	case OPC_CMPS_F:
    848 	case OPC_ABSNEG_F:
    849 	case OPC_CMPV_F:
    850 	case OPC_FLOOR_F:
    851 	case OPC_CEIL_F:
    852 	case OPC_RNDNE_F:
    853 	case OPC_RNDAZ_F:
    854 	case OPC_TRUNC_F:
    855 	case OPC_BARY_F:
    856 		return IR3_REG_FABS | IR3_REG_FNEG;
    857 
    858 	case OPC_ADD_U:
    859 	case OPC_ADD_S:
    860 	case OPC_SUB_U:
    861 	case OPC_SUB_S:
    862 	case OPC_CMPS_U:
    863 	case OPC_CMPS_S:
    864 	case OPC_MIN_U:
    865 	case OPC_MIN_S:
    866 	case OPC_MAX_U:
    867 	case OPC_MAX_S:
    868 	case OPC_CMPV_U:
    869 	case OPC_CMPV_S:
    870 	case OPC_MUL_U:
    871 	case OPC_MUL_S:
    872 	case OPC_MULL_U:
    873 	case OPC_CLZ_S:
    874 		return 0;
    875 
    876 	case OPC_ABSNEG_S:
    877 		return IR3_REG_SABS | IR3_REG_SNEG;
    878 
    879 	case OPC_AND_B:
    880 	case OPC_OR_B:
    881 	case OPC_NOT_B:
    882 	case OPC_XOR_B:
    883 	case OPC_BFREV_B:
    884 	case OPC_CLZ_B:
    885 	case OPC_SHL_B:
    886 	case OPC_SHR_B:
    887 	case OPC_ASHR_B:
    888 	case OPC_MGEN_B:
    889 	case OPC_GETBIT_B:
    890 	case OPC_CBITS_B:
    891 		return IR3_REG_BNOT;
    892 
    893 	default:
    894 		return 0;
    895 	}
    896 }
    897 
    898 /* map cat3 instructions to valid abs/neg flags: */
    899 static inline unsigned ir3_cat3_absneg(opc_t opc)
    900 {
    901 	switch (opc) {
    902 	case OPC_MAD_F16:
    903 	case OPC_MAD_F32:
    904 	case OPC_SEL_F16:
    905 	case OPC_SEL_F32:
    906 		return IR3_REG_FNEG;
    907 
    908 	case OPC_MAD_U16:
    909 	case OPC_MADSH_U16:
    910 	case OPC_MAD_S16:
    911 	case OPC_MADSH_M16:
    912 	case OPC_MAD_U24:
    913 	case OPC_MAD_S24:
    914 	case OPC_SEL_S16:
    915 	case OPC_SEL_S32:
    916 	case OPC_SAD_S16:
    917 	case OPC_SAD_S32:
    918 		/* neg *may* work on 3rd src.. */
    919 
    920 	case OPC_SEL_B16:
    921 	case OPC_SEL_B32:
    922 
    923 	default:
    924 		return 0;
    925 	}
    926 }
    927 
    928 #define MASK(n) ((1 << (n)) - 1)
    929 
    930 /* iterator for an instructions's sources (reg), also returns src #: */
    931 #define foreach_src_n(__srcreg, __n, __instr) \
    932 	if ((__instr)->regs_count) \
    933 		for (unsigned __cnt = (__instr)->regs_count - 1, __n = 0; __n < __cnt; __n++) \
    934 			if ((__srcreg = (__instr)->regs[__n + 1]))
    935 
    936 /* iterator for an instructions's sources (reg): */
    937 #define foreach_src(__srcreg, __instr) \
    938 	foreach_src_n(__srcreg, __i, __instr)
    939 
    940 static inline unsigned __ssa_src_cnt(struct ir3_instruction *instr)
    941 {
    942 	unsigned cnt = instr->regs_count + instr->deps_count;
    943 	if (instr->address)
    944 		cnt++;
    945 	return cnt;
    946 }
    947 
    948 static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr, unsigned n)
    949 {
    950 	if (n == (instr->regs_count + instr->deps_count))
    951 		return instr->address;
    952 	if (n >= instr->regs_count)
    953 		return instr->deps[n - instr->regs_count];
    954 	return ssa(instr->regs[n]);
    955 }
    956 
    957 static inline bool __is_false_dep(struct ir3_instruction *instr, unsigned n)
    958 {
    959 	if (n == (instr->regs_count + instr->deps_count))
    960 		return false;
    961 	if (n >= instr->regs_count)
    962 		return true;
    963 	return false;
    964 }
    965 
    966 #define __src_cnt(__instr) ((__instr)->address ? (__instr)->regs_count : (__instr)->regs_count - 1)
    967 
    968 /* iterator for an instruction's SSA sources (instr), also returns src #: */
    969 #define foreach_ssa_src_n(__srcinst, __n, __instr) \
    970 	for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; __n++) \
    971 		if ((__srcinst = __ssa_src_n(__instr, __n)))
    972 
    973 /* iterator for an instruction's SSA sources (instr): */
    974 #define foreach_ssa_src(__srcinst, __instr) \
    975 	foreach_ssa_src_n(__srcinst, __i, __instr)
    976 
    977 
    978 /* dump: */
    979 void ir3_print(struct ir3 *ir);
    980 void ir3_print_instr(struct ir3_instruction *instr);
    981 
    982 /* depth calculation: */
    983 int ir3_delayslots(struct ir3_instruction *assigner,
    984 		struct ir3_instruction *consumer, unsigned n);
    985 void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
    986 void ir3_depth(struct ir3 *ir);
    987 
    988 /* copy-propagate: */
    989 struct ir3_shader_variant;
    990 void ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
    991 
    992 /* group neighbors and insert mov's to resolve conflicts: */
    993 void ir3_group(struct ir3 *ir);
    994 
    995 /* scheduling: */
    996 void ir3_sched_add_deps(struct ir3 *ir);
    997 int ir3_sched(struct ir3 *ir);
    998 
    999 /* register assignment: */
   1000 struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(void *memctx);
   1001 int ir3_ra(struct ir3 *ir3, enum shader_t type,
   1002 		bool frag_coord, bool frag_face);
   1003 
   1004 /* legalize: */
   1005 void ir3_legalize(struct ir3 *ir, bool *has_samp, bool *has_ssbo, int *max_bary);
   1006 
   1007 /* ************************************************************************* */
   1008 /* instruction helpers */
   1009 
   1010 static inline struct ir3_instruction *
   1011 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type)
   1012 {
   1013 	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
   1014 	ir3_reg_create(instr, 0, 0);   /* dst */
   1015 	if (src->regs[0]->flags & IR3_REG_ARRAY) {
   1016 		struct ir3_register *src_reg =
   1017 			ir3_reg_create(instr, 0, IR3_REG_ARRAY);
   1018 		src_reg->array = src->regs[0]->array;
   1019 		src_reg->instr = src;
   1020 	} else {
   1021 		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
   1022 	}
   1023 	debug_assert(!(src->regs[0]->flags & IR3_REG_RELATIV));
   1024 	instr->cat1.src_type = type;
   1025 	instr->cat1.dst_type = type;
   1026 	return instr;
   1027 }
   1028 
   1029 static inline struct ir3_instruction *
   1030 ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
   1031 		type_t src_type, type_t dst_type)
   1032 {
   1033 	struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV);
   1034 	ir3_reg_create(instr, 0, 0);   /* dst */
   1035 	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
   1036 	instr->cat1.src_type = src_type;
   1037 	instr->cat1.dst_type = dst_type;
   1038 	debug_assert(!(src->regs[0]->flags & IR3_REG_ARRAY));
   1039 	return instr;
   1040 }
   1041 
   1042 static inline struct ir3_instruction *
   1043 ir3_NOP(struct ir3_block *block)
   1044 {
   1045 	return ir3_instr_create(block, OPC_NOP);
   1046 }
   1047 
   1048 #define INSTR0(name)                                                     \
   1049 static inline struct ir3_instruction *                                   \
   1050 ir3_##name(struct ir3_block *block)                                      \
   1051 {                                                                        \
   1052 	struct ir3_instruction *instr =                                      \
   1053 		ir3_instr_create(block, OPC_##name);                             \
   1054 	return instr;                                                        \
   1055 }
   1056 
   1057 #define INSTR1(name)                                                     \
   1058 static inline struct ir3_instruction *                                   \
   1059 ir3_##name(struct ir3_block *block,                                      \
   1060 		struct ir3_instruction *a, unsigned aflags)                      \
   1061 {                                                                        \
   1062 	struct ir3_instruction *instr =                                      \
   1063 		ir3_instr_create(block, OPC_##name);                             \
   1064 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
   1065 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
   1066 	return instr;                                                        \
   1067 }
   1068 
   1069 #define INSTR2(name)                                                     \
   1070 static inline struct ir3_instruction *                                   \
   1071 ir3_##name(struct ir3_block *block,                                      \
   1072 		struct ir3_instruction *a, unsigned aflags,                      \
   1073 		struct ir3_instruction *b, unsigned bflags)                      \
   1074 {                                                                        \
   1075 	struct ir3_instruction *instr =                                      \
   1076 		ir3_instr_create(block, OPC_##name);                             \
   1077 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
   1078 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
   1079 	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
   1080 	return instr;                                                        \
   1081 }
   1082 
   1083 #define INSTR3(name)                                                     \
   1084 static inline struct ir3_instruction *                                   \
   1085 ir3_##name(struct ir3_block *block,                                      \
   1086 		struct ir3_instruction *a, unsigned aflags,                      \
   1087 		struct ir3_instruction *b, unsigned bflags,                      \
   1088 		struct ir3_instruction *c, unsigned cflags)                      \
   1089 {                                                                        \
   1090 	struct ir3_instruction *instr =                                      \
   1091 		ir3_instr_create(block, OPC_##name);                             \
   1092 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
   1093 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
   1094 	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
   1095 	ir3_reg_create(instr, 0, IR3_REG_SSA | cflags)->instr = c;           \
   1096 	return instr;                                                        \
   1097 }
   1098 
   1099 #define INSTR4(name)                                                     \
   1100 static inline struct ir3_instruction *                                   \
   1101 ir3_##name(struct ir3_block *block,                                      \
   1102 		struct ir3_instruction *a, unsigned aflags,                      \
   1103 		struct ir3_instruction *b, unsigned bflags,                      \
   1104 		struct ir3_instruction *c, unsigned cflags,                      \
   1105 		struct ir3_instruction *d, unsigned dflags)                      \
   1106 {                                                                        \
   1107 	struct ir3_instruction *instr =                                      \
   1108 		ir3_instr_create2(block, OPC_##name, 5);                         \
   1109 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
   1110 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
   1111 	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
   1112 	ir3_reg_create(instr, 0, IR3_REG_SSA | cflags)->instr = c;           \
   1113 	ir3_reg_create(instr, 0, IR3_REG_SSA | dflags)->instr = d;           \
   1114 	return instr;                                                        \
   1115 }
   1116 
   1117 #define INSTR4F(f, name)                                                 \
   1118 static inline struct ir3_instruction *                                   \
   1119 ir3_##name##_##f(struct ir3_block *block,                                \
   1120 		struct ir3_instruction *a, unsigned aflags,                      \
   1121 		struct ir3_instruction *b, unsigned bflags,                      \
   1122 		struct ir3_instruction *c, unsigned cflags,                      \
   1123 		struct ir3_instruction *d, unsigned dflags)                      \
   1124 {                                                                        \
   1125 	struct ir3_instruction *instr =                                      \
   1126 		ir3_instr_create2(block, OPC_##name, 5);                         \
   1127 	ir3_reg_create(instr, 0, 0);   /* dst */                             \
   1128 	ir3_reg_create(instr, 0, IR3_REG_SSA | aflags)->instr = a;           \
   1129 	ir3_reg_create(instr, 0, IR3_REG_SSA | bflags)->instr = b;           \
   1130 	ir3_reg_create(instr, 0, IR3_REG_SSA | cflags)->instr = c;           \
   1131 	ir3_reg_create(instr, 0, IR3_REG_SSA | dflags)->instr = d;           \
   1132 	instr->flags |= IR3_INSTR_##f;                                       \
   1133 	return instr;                                                        \
   1134 }
   1135 
   1136 /* cat0 instructions (each INSTRn(X) line defines the ir3_X() builder; n is its number of sources): */
   1137 INSTR0(BR)
   1138 INSTR0(JUMP)
   1139 INSTR1(KILL)
   1140 INSTR0(END)
   1141 
   1142 /* cat2 instructions, most 2 src (INSTR2) but some 1 src (INSTR1); each line defines ir3_<NAME>(): */
   1143 INSTR2(ADD_F)
   1144 INSTR2(MIN_F)
   1145 INSTR2(MAX_F)
   1146 INSTR2(MUL_F)
   1147 INSTR1(SIGN_F)
   1148 INSTR2(CMPS_F)
   1149 INSTR1(ABSNEG_F)
   1150 INSTR2(CMPV_F)
   1151 INSTR1(FLOOR_F)
   1152 INSTR1(CEIL_F)
   1153 INSTR1(RNDNE_F)
   1154 INSTR1(RNDAZ_F)
   1155 INSTR1(TRUNC_F)
   1156 INSTR2(ADD_U)
   1157 INSTR2(ADD_S)
   1158 INSTR2(SUB_U)
   1159 INSTR2(SUB_S)
   1160 INSTR2(CMPS_U)
   1161 INSTR2(CMPS_S)
   1162 INSTR2(MIN_U)
   1163 INSTR2(MIN_S)
   1164 INSTR2(MAX_U)
   1165 INSTR2(MAX_S)
   1166 INSTR1(ABSNEG_S)
   1167 INSTR2(AND_B)
   1168 INSTR2(OR_B)
   1169 INSTR1(NOT_B)
   1170 INSTR2(XOR_B)
   1171 INSTR2(CMPV_U)
   1172 INSTR2(CMPV_S)
   1173 INSTR2(MUL_U)
   1174 INSTR2(MUL_S)
   1175 INSTR2(MULL_U)
   1176 INSTR1(BFREV_B)
   1177 INSTR1(CLZ_S)
   1178 INSTR1(CLZ_B)
   1179 INSTR2(SHL_B)
   1180 INSTR2(SHR_B)
   1181 INSTR2(ASHR_B)
   1182 INSTR2(BARY_F)
   1183 INSTR2(MGEN_B)
   1184 INSTR2(GETBIT_B)
   1185 INSTR1(SETRM)
   1186 INSTR1(CBITS_B)
   1187 INSTR2(SHB)
   1188 INSTR2(MSAD)
   1189 
   1190 /* cat3 instructions (all three-source; each line defines ir3_<NAME>() via INSTR3): */
   1191 INSTR3(MAD_U16)
   1192 INSTR3(MADSH_U16)
   1193 INSTR3(MAD_S16)
   1194 INSTR3(MADSH_M16)
   1195 INSTR3(MAD_U24)
   1196 INSTR3(MAD_S24)
   1197 INSTR3(MAD_F16)
   1198 INSTR3(MAD_F32)
   1199 INSTR3(SEL_B16)
   1200 INSTR3(SEL_B32)
   1201 INSTR3(SEL_S16)
   1202 INSTR3(SEL_S32)
   1203 INSTR3(SEL_F16)
   1204 INSTR3(SEL_F32)
   1205 INSTR3(SAD_S16)
   1206 INSTR3(SAD_S32)
   1207 
   1208 /* cat4 instructions (all single-source; each line defines ir3_<NAME>() via INSTR1): */
   1209 INSTR1(RCP)
   1210 INSTR1(RSQ)
   1211 INSTR1(LOG2)
   1212 INSTR1(EXP2)
   1213 INSTR1(SIN)
   1214 INSTR1(COS)
   1215 INSTR1(SQRT)
   1216 
   1217 /* cat5 instructions (single-source builders here; ir3_SAM(), which fills in the cat5 fields, follows): */
   1218 INSTR1(DSX)
   1219 INSTR1(DSY)
   1220 
   1221 static inline struct ir3_instruction *
   1222 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type,
   1223 		unsigned wrmask, unsigned flags, unsigned samp, unsigned tex,
   1224 		struct ir3_instruction *src0, struct ir3_instruction *src1)
   1225 {
   1226 	struct ir3_instruction *sam;
   1227 	struct ir3_register *reg;
   1228 
   1229 	sam = ir3_instr_create(block, opc);
   1230 	sam->flags |= flags;
   1231 	ir3_reg_create(sam, 0, 0)->wrmask = wrmask;
   1232 	if (src0) {
   1233 		reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
   1234 		reg->wrmask = (1 << (src0->regs_count - 1)) - 1;
   1235 		reg->instr = src0;
   1236 	}
   1237 	if (src1) {
   1238 		reg = ir3_reg_create(sam, 0, IR3_REG_SSA);
   1239 		reg->instr = src1;
   1240 		reg->wrmask = (1 << (src1->regs_count - 1)) - 1;
   1241 	}
   1242 	sam->cat5.samp = samp;
   1243 	sam->cat5.tex  = tex;
   1244 	sam->cat5.type  = type;
   1245 
   1246 	return sam;
   1247 }
   1248 
   1249 /* cat6 instructions (INSTRn(X) defines ir3_X(); INSTR4F(G, X) defines ir3_X_G(), which also sets IR3_INSTR_G): */
   1250 INSTR2(LDLV)
   1251 INSTR2(LDG)
   1252 INSTR2(LDL)
   1253 INSTR3(STG)
   1254 INSTR3(STL)
   1255 INSTR3(LDGB)
   1256 INSTR4(STGB)
   1257 INSTR4(STIB)
   1258 INSTR1(RESINFO)
   1259 INSTR1(RESFMT)
   1260 INSTR2(ATOMIC_ADD)
   1261 INSTR2(ATOMIC_SUB)
   1262 INSTR2(ATOMIC_XCHG)
   1263 INSTR2(ATOMIC_INC)
   1264 INSTR2(ATOMIC_DEC)
   1265 INSTR2(ATOMIC_CMPXCHG)
   1266 INSTR2(ATOMIC_MIN)
   1267 INSTR2(ATOMIC_MAX)
   1268 INSTR2(ATOMIC_AND)
   1269 INSTR2(ATOMIC_OR)
   1270 INSTR2(ATOMIC_XOR)
   1271 INSTR4F(G, ATOMIC_ADD)
   1272 INSTR4F(G, ATOMIC_SUB)
   1273 INSTR4F(G, ATOMIC_XCHG)
   1274 INSTR4F(G, ATOMIC_INC)
   1275 INSTR4F(G, ATOMIC_DEC)
   1276 INSTR4F(G, ATOMIC_CMPXCHG)
   1277 INSTR4F(G, ATOMIC_MIN)
   1278 INSTR4F(G, ATOMIC_MAX)
   1279 INSTR4F(G, ATOMIC_AND)
   1280 INSTR4F(G, ATOMIC_OR)
   1281 INSTR4F(G, ATOMIC_XOR)
   1282 
   1283 /* cat7 instructions (no registers attached; each line defines ir3_<NAME>() via INSTR0): */
   1284 INSTR0(BAR)
   1285 INSTR0(FENCE)
   1286 
   1287 /* ************************************************************************* */
   1288 /* split this out or find some helper to use.. like main/bitset.h.. */
   1289 
   1290 #include <string.h>
   1291 
   1292 #define MAX_REG 256 /* exclusive upper bound on the register number accepted by regmask_idx() */
   1293 
   1294 typedef uint8_t regmask_t[2 * MAX_REG / 8]; /* one bit per index from regmask_idx(): [0, MAX_REG) plain, [MAX_REG, 2 * MAX_REG) for IR3_REG_HALF */
   1295 
   1296 static inline unsigned regmask_idx(struct ir3_register *reg)
   1297 {
   1298 	unsigned num = (reg->flags & IR3_REG_RELATIV) ? reg->array.offset : reg->num;
   1299 	debug_assert(num < MAX_REG);
   1300 	if (reg->flags & IR3_REG_HALF)
   1301 		num += MAX_REG;
   1302 	return num;
   1303 }
   1304 
   1305 static inline void regmask_init(regmask_t *regmask)
   1306 {
   1307 	memset(regmask, 0, sizeof(*regmask));
   1308 }
   1309 
   1310 static inline void regmask_set(regmask_t *regmask, struct ir3_register *reg)
   1311 {
   1312 	unsigned idx = regmask_idx(reg);
   1313 	if (reg->flags & IR3_REG_RELATIV) {
   1314 		unsigned i;
   1315 		for (i = 0; i < reg->size; i++, idx++)
   1316 			(*regmask)[idx / 8] |= 1 << (idx % 8);
   1317 	} else {
   1318 		unsigned mask;
   1319 		for (mask = reg->wrmask; mask; mask >>= 1, idx++)
   1320 			if (mask & 1)
   1321 				(*regmask)[idx / 8] |= 1 << (idx % 8);
   1322 	}
   1323 }
   1324 
   1325 static inline void regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b)
   1326 {
   1327 	unsigned i;
   1328 	for (i = 0; i < ARRAY_SIZE(*dst); i++)
   1329 		(*dst)[i] = (*a)[i] | (*b)[i];
   1330 }
   1331 
   1332 /* set bits in a if not set in b, conceptually:
   1333  *   a |= (reg & ~b)
   1334  */
   1335 static inline void regmask_set_if_not(regmask_t *a,
   1336 		struct ir3_register *reg, regmask_t *b)
   1337 {
   1338 	unsigned idx = regmask_idx(reg);
   1339 	if (reg->flags & IR3_REG_RELATIV) {
   1340 		unsigned i;
   1341 		for (i = 0; i < reg->size; i++, idx++)
   1342 			if (!((*b)[idx / 8] & (1 << (idx % 8))))
   1343 				(*a)[idx / 8] |= 1 << (idx % 8);
   1344 	} else {
   1345 		unsigned mask;
   1346 		for (mask = reg->wrmask; mask; mask >>= 1, idx++)
   1347 			if (mask & 1)
   1348 				if (!((*b)[idx / 8] & (1 << (idx % 8))))
   1349 					(*a)[idx / 8] |= 1 << (idx % 8);
   1350 	}
   1351 }
   1352 
   1353 static inline bool regmask_get(regmask_t *regmask,
   1354 		struct ir3_register *reg)
   1355 {
   1356 	unsigned idx = regmask_idx(reg);
   1357 	if (reg->flags & IR3_REG_RELATIV) {
   1358 		unsigned i;
   1359 		for (i = 0; i < reg->size; i++, idx++)
   1360 			if ((*regmask)[idx / 8] & (1 << (idx % 8)))
   1361 				return true;
   1362 	} else {
   1363 		unsigned mask;
   1364 		for (mask = reg->wrmask; mask; mask >>= 1, idx++)
   1365 			if (mask & 1)
   1366 				if ((*regmask)[idx / 8] & (1 << (idx % 8)))
   1367 					return true;
   1368 	}
   1369 	return false;
   1370 }
   1371 
   1372 /* ************************************************************************* */
   1373 
   1374 #endif /* IR3_H_ */
   1375