// Source listing header (code-viewer breadcrumb): sb
      1 /*
      2  * Copyright 2013 Vadim Girlin <vadimgirlin (at) gmail.com>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  *
     23  * Authors:
     24  *      Vadim Girlin
     25  */
     26 
     27 #define BCP_DEBUG 0
     28 
     29 #if BCP_DEBUG
     30 #define BCP_DUMP(q) do { q } while (0)
     31 #else
     32 #define BCP_DUMP(q)
     33 #endif
     34 
     35 #include "r600_pipe.h"
     36 #include "r600_shader.h"
     37 #include "eg_sq.h" // CM_V_SQ_MOVA_DST_CF_IDX0/1
     38 
     39 #include <stack>
     40 
     41 #include "sb_bc.h"
     42 #include "sb_shader.h"
     43 #include "sb_pass.h"
     44 #include "util/macros.h"
     45 
     46 namespace r600_sb {
     47 
     48 int bc_parser::decode() {
     49 
     50 	dw = bc->bytecode;
     51 	bc_ndw = bc->ndw;
     52 	max_cf = 0;
     53 
     54 	dec = new bc_decoder(ctx, dw, bc_ndw);
     55 
     56 	shader_target t = TARGET_UNKNOWN;
     57 
     58 	if (pshader) {
     59 		switch (bc->type) {
     60 		case PIPE_SHADER_FRAGMENT: t = TARGET_PS; break;
     61 		case PIPE_SHADER_VERTEX:
     62 			t = pshader->vs_as_ls ? TARGET_LS : (pshader->vs_as_es ? TARGET_ES : TARGET_VS);
     63 			break;
     64 		case PIPE_SHADER_GEOMETRY: t = TARGET_GS; break;
     65 		case PIPE_SHADER_COMPUTE: t = TARGET_COMPUTE; break;
     66 		case PIPE_SHADER_TESS_CTRL: t = TARGET_HS; break;
     67 		case PIPE_SHADER_TESS_EVAL: t = pshader->tes_as_es ? TARGET_ES : TARGET_VS; break;
     68 		default: assert(!"unknown shader target"); return -1; break;
     69 		}
     70 	} else {
     71 		if (bc->type == PIPE_SHADER_COMPUTE)
     72 			t = TARGET_COMPUTE;
     73 		else
     74 			t = TARGET_FETCH;
     75 	}
     76 
     77 	sh = new shader(ctx, t, bc->debug_id);
     78 	sh->safe_math = sb_context::safe_math || (t == TARGET_COMPUTE);
     79 
     80 	int r = decode_shader();
     81 
     82 	delete dec;
     83 
     84 	sh->ngpr = bc->ngpr;
     85 	sh->nstack = bc->nstack;
     86 
     87 	return r;
     88 }
     89 
     90 int bc_parser::decode_shader() {
     91 	int r = 0;
     92 	unsigned i = 0;
     93 	bool eop = false;
     94 
     95 	sh->init();
     96 
     97 	do {
     98 		eop = false;
     99 		if ((r = decode_cf(i, eop)))
    100 			return r;
    101 
    102 	} while (!eop || (i >> 1) < max_cf);
    103 
    104 	return 0;
    105 }
    106 
    107 int bc_parser::prepare() {
    108 	int r = 0;
    109 	if ((r = parse_decls()))
    110 		return r;
    111 	if ((r = prepare_ir()))
    112 		return r;
    113 	return 0;
    114 }
    115 
// Translate the gallium-level shader declarations (pshader) into sb-level
// inputs and GPR arrays, so the optimizer knows which registers are live
// on entry and which are subject to relative addressing.
int bc_parser::parse_decls() {

	if (!pshader) {
		// No gallium shader info (standalone fetch/compute bytecode):
		// if relative GPR addressing was observed during decode, be
		// conservative and treat the whole register file as one array.
		if (gpr_reladdr)
			sh->add_gpr_array(0, bc->ngpr, 0x0F);

		// compute shaders have some values preloaded in R0, R1
		sh->add_input(0 /* GPR */, true /* preloaded */, 0x0F /* mask */);
		sh->add_input(1 /* GPR */, true /* preloaded */, 0x0F /* mask */);
		return 0;
	}

	// Indirect addressing of anything other than constants/samplers means
	// the translator declared GPR arrays that we must register here.
	if (pshader->indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER))) {

		assert(pshader->num_arrays);

		if (pshader->num_arrays) {
			for (unsigned i = 0; i < pshader->num_arrays; ++i) {
				r600_shader_array &a = pshader->arrays[i];
				sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask);
			}
		} else {
			// Release-build fallback when no array info is available:
			// make the whole register file a single array.
			sh->add_gpr_array(0, pshader->bc.ngpr, 0x0F);
		}
	}

	// GS inputs can add indirect addressing
	if (sh->target == TARGET_GS) {
		if (pshader->num_arrays) {
			for (unsigned i = 0; i < pshader->num_arrays; ++i) {
				r600_shader_array &a = pshader->arrays[i];
				sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask);
			}
		}
	}

	// Mark hardware-preloaded registers as shader inputs: R0 for
	// VS/ES/HS, R0 and R1 for GS.
	if (sh->target == TARGET_VS || sh->target == TARGET_ES || sh->target == TARGET_HS)
		sh->add_input(0, 1, 0x0F);
	else if (sh->target == TARGET_GS) {
		sh->add_input(0, 1, 0x0F);
		sh->add_input(1, 1, 0x0F);
	}

	// On evergreen+, PS inputs with a spi_sid are interpolated in the
	// shader itself rather than arriving preloaded in GPRs.
	bool ps_interp = ctx.hw_class >= HW_CLASS_EVERGREEN
			&& sh->target == TARGET_PS;

	// Tracks which of the 6 possible i/j interpolator pairs are used.
	bool ij_interpolators[6];
	memset(ij_interpolators, 0, sizeof(ij_interpolators));

	for (unsigned i = 0; i < pshader->ninput; ++i) {
		r600_shader_io & in = pshader->input[i];
		bool preloaded = sh->target == TARGET_PS && !(ps_interp && in.spi_sid);
		sh->add_input(in.gpr, preloaded, /*in.write_mask*/ 0x0F);
		if (ps_interp && in.spi_sid) {
			int k = eg_get_interpolator_index(in.interpolate, in.interpolate_location);
			if (k >= 0)
				ij_interpolators[k] |= true;
		}
	}

	if (ps_interp) {
		/* add the egcm ij interpolators to live inputs */
		unsigned num_ij = 0;
		for (unsigned i = 0; i < ARRAY_SIZE(ij_interpolators); i++) {
			num_ij += ij_interpolators[i];
		}

		// Each interpolator occupies two channels (i, j), packed
		// consecutively from GPR0 onward, four channels per register.
		unsigned mask = (1 << (2 * num_ij)) - 1;
		unsigned gpr = 0;

		while (mask) {
			sh->add_input(gpr, true, mask & 0x0F);
			++gpr;
			mask >>= 4;
		}
	}

	return 0;
}
    195 
    196 int bc_parser::decode_cf(unsigned &i, bool &eop) {
    197 
    198 	int r;
    199 
    200 	cf_node *cf = sh->create_cf();
    201 	sh->root->push_back(cf);
    202 
    203 	unsigned id = i >> 1;
    204 
    205 	cf->bc.id = id;
    206 
    207 	if (cf_map.size() < id + 1)
    208 		cf_map.resize(id + 1);
    209 
    210 	cf_map[id] = cf;
    211 
    212 	if ((r = dec->decode_cf(i, cf->bc)))
    213 		return r;
    214 
    215 	cf_op_flags flags = (cf_op_flags)cf->bc.op_ptr->flags;
    216 
    217 	if (flags & CF_ALU) {
    218 		if ((r = decode_alu_clause(cf)))
    219 			return r;
    220 	} else if (flags & CF_FETCH) {
    221 		if ((r = decode_fetch_clause(cf)))
    222 			return r;
    223 	} else if (flags & CF_EXP) {
    224 		if (cf->bc.rw_rel)
    225 			gpr_reladdr = true;
    226 		assert(!cf->bc.rw_rel);
    227 	} else if (flags & CF_MEM) {
    228 		if (cf->bc.rw_rel)
    229 			gpr_reladdr = true;
    230 		assert(!cf->bc.rw_rel);
    231 	} else if (flags & CF_BRANCH) {
    232 		if (cf->bc.addr > max_cf)
    233 			max_cf = cf->bc.addr;
    234 	}
    235 
    236 	eop = cf->bc.end_of_program || cf->bc.op == CF_OP_CF_END ||
    237 			cf->bc.op == CF_OP_RET;
    238 	return 0;
    239 }
    240 
    241 int bc_parser::decode_alu_clause(cf_node* cf) {
    242 	unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1, gcnt;
    243 
    244 	cf->subtype = NST_ALU_CLAUSE;
    245 
    246 	cgroup = 0;
    247 	memset(slots[0], 0, 5*sizeof(slots[0][0]));
    248 
    249 	unsigned ng = 0;
    250 
    251 	do {
    252 		decode_alu_group(cf, i, gcnt);
    253 		assert(gcnt <= cnt);
    254 		cnt -= gcnt;
    255 		ng++;
    256 	} while (cnt);
    257 
    258 	return 0;
    259 }
    260 
// Decode a single ALU instruction group starting at dword offset `i`:
// instructions up to (and including) the one with the `last` bit set,
// followed by any literal constant dwords. On return `i` points past the
// group and `gcnt` holds its size in 64-bit instruction slots.
int bc_parser::decode_alu_group(cf_node* cf, unsigned &i, unsigned &gcnt) {
	int r;
	alu_node *n;
	alu_group_node *g = sh->create_alu_group();

	// slots[] is double-buffered: cgroup flips between 0/1 for each group
	// so the previous group's slot assignment stays available for PV/PS
	// source resolution in prepare_alu_group().
	cgroup = !cgroup;
	memset(slots[cgroup], 0, 5*sizeof(slots[0][0]));
	gcnt = 0;

	unsigned literal_mask = 0;

	do {
		n = sh->create_alu();
		g->push_back(n);

		if ((r = dec->decode_alu(i, n->bc)))
			return r;

		if (!sh->assign_slot(n, slots[cgroup])) {
			assert(!"alu slot assignment failed");
			return -1;
		}

		gcnt++;

	} while (gcnt <= 5 && !n->bc.last);

	// a well-formed group always terminates with a `last` instruction
	assert(n->bc.last);

	// Record relative-addressing usage and resolve literal operands:
	// a literal source's channel indexes into the dwords that
	// immediately follow the instruction group.
	for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) {
		n = static_cast<alu_node*>(*I);

		if (n->bc.dst_rel)
			gpr_reladdr = true;

		for (int k = 0; k < n->bc.op_ptr->src_count; ++k) {
			bc_alu_src &src = n->bc.src[k];
			if (src.rel)
				gpr_reladdr = true;
			if (src.sel == ALU_SRC_LITERAL) {
				literal_mask |= (1 << src.chan);
				src.value.u = dw[i + src.chan];
			}
		}
	}

	// Copy the occupied literal dwords into the group; literal_mask has
	// one bit per referenced channel, so the highest set bit determines
	// how many dwords are used.
	unsigned literal_ndw = 0;
	while (literal_mask) {
		g->literals.push_back(dw[i + literal_ndw]);
		literal_ndw += 1;
		literal_mask >>= 1;
	}

	// literals occupy whole 64-bit slots, so round up to an even count
	literal_ndw = (literal_ndw + 1) & ~1u;

	i += literal_ndw;
	gcnt += literal_ndw >> 1;

	cf->push_back(g);
	return 0;
}
    322 
    323 int bc_parser::prepare_alu_clause(cf_node* cf) {
    324 
    325 	// loop over alu groups
    326 	for (node_iterator I = cf->begin(), E = cf->end(); I != E; ++I) {
    327 		assert(I->subtype == NST_ALU_GROUP);
    328 		alu_group_node *g = static_cast<alu_group_node*>(*I);
    329 		prepare_alu_group(cf, g);
    330 	}
    331 
    332 	return 0;
    333 }
    334 
    335 void bc_parser::save_set_cf_index(value *val, unsigned idx)
    336 {
    337 	assert(idx <= 1);
    338 	assert(val);
    339 	cf_index_value[idx] = val;
    340 }
    341 value *bc_parser::get_cf_index_value(unsigned idx)
    342 {
    343 	assert(idx <= 1);
    344 	assert(cf_index_value[idx]);
    345 	return cf_index_value[idx];
    346 }
    347 void bc_parser::save_mova(alu_node *mova)
    348 {
    349 	assert(mova);
    350 	this->mova = mova;
    351 }
    352 alu_node *bc_parser::get_mova()
    353 {
    354 	assert(mova);
    355 	return mova;
    356 }
    357 
// Build IR operands (dst/src value nodes) for every ALU instruction of
// group `g`: resolve PV/PS references against the previous group, kcache
// and special sources, predicates, and finally pack multi-slot
// instructions into an alu_packed_node. Returns -1 on slot-assignment
// failure, 0 otherwise.
int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {

	alu_node *n;

	// flip the double-buffered slot table (same scheme as in decode):
	// slots[cgroup] is the current group, slots[!cgroup] the previous one
	cgroup = !cgroup;
	memset(slots[cgroup], 0, 5*sizeof(slots[0][0]));

	for (node_iterator I = g->begin(), E = g->end();
			I != E; ++I) {
		n = static_cast<alu_node*>(*I);
		// set when a kcache source uses CF_IDX0/1 indexing; the index
		// value is appended as an extra source dependency below
		bool ubo_indexing[2] = {};

		if (!sh->assign_slot(n, slots[cgroup])) {
			assert(!"alu slot assignment failed");
			return -1;
		}

		unsigned src_count = n->bc.op_ptr->src_count;

		if (ctx.alu_slots(n->bc.op) & AF_4SLOT)
			n->flags |= NF_ALU_4SLOT;

		n->src.resize(src_count);

		unsigned flags = n->bc.op_ptr->flags;

		if (flags & AF_PRED) {
			// predicate ops may additionally write the ALU predicate
			// (dst[1]) and/or the exec mask (dst[2])
			n->dst.resize(3);
			if (n->bc.update_pred)
				n->dst[1] = sh->get_special_value(SV_ALU_PRED);
			if (n->bc.update_exec_mask)
				n->dst[2] = sh->get_special_value(SV_EXEC_MASK);

			n->flags |= NF_DONT_HOIST;

		} else if (flags & AF_KILL) {

			// kill ops write the valid mask as a hidden second dst
			n->dst.resize(2);
			n->dst[1] = sh->get_special_value(SV_VALID_MASK);
			sh->set_uses_kill();

			n->flags |= NF_DONT_HOIST | NF_DONT_MOVE |
					NF_DONT_KILL | NF_SCHEDULE_EARLY;

		} else {
			n->dst.resize(1);
		}

		if (n->bc.op == ALU_OP0_SET_CF_IDX0 || n->bc.op == ALU_OP0_SET_CF_IDX1) {
			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
			// DCE will kill this op
			save_set_cf_index(get_mova()->src[0], n->bc.op == ALU_OP0_SET_CF_IDX1);
		} else if (flags & AF_MOVA) {

			n->dst[0] = sh->get_special_value(SV_AR_INDEX);
			save_mova(n);

			n->flags |= NF_DONT_HOIST;

		} else if (n->bc.op_ptr->src_count == 3 || n->bc.write_mask) {
			assert(!n->bc.dst_rel || n->bc.index_mode == INDEX_AR_X);

			value *v = sh->get_gpr_value(false, n->bc.dst_gpr, n->bc.dst_chan,
					n->bc.dst_rel);

			n->dst[0] = v;
		}

		if (n->bc.pred_sel) {
			sh->has_alu_predication = true;
			n->pred = sh->get_special_value(SV_ALU_PRED);
		}

		for (unsigned s = 0; s < src_count; ++s) {
			bc_alu_src &src = n->bc.src[s];

			if (src.sel == ALU_SRC_LITERAL) {
				n->src[s] = sh->get_const_value(src.value);
			} else if (src.sel == ALU_SRC_PS || src.sel == ALU_SRC_PV) {
				// PV/PS refer to the previous group's results: PS is the
				// trans slot, PV.chan selects a vector slot
				unsigned pgroup = !cgroup, prev_slot = src.sel == ALU_SRC_PS ?
						SLOT_TRANS : src.chan;

				// XXX shouldn't happen but llvm backend uses PS on cayman
				if (prev_slot == SLOT_TRANS && ctx.is_cayman())
					prev_slot = SLOT_X;

				alu_node *prev_alu = slots[pgroup][prev_slot];

				assert(prev_alu);

				// the producing instruction may have no dst (write_mask
				// off); materialize a temp so the PV/PS link is explicit
				if (!prev_alu->dst[0]) {
					value * t = sh->create_temp_value();
					prev_alu->dst[0] = t;
				}

				value *d = prev_alu->dst[0];

				if (d->is_rel()) {
					d = sh->get_gpr_value(true, prev_alu->bc.dst_gpr,
					                      prev_alu->bc.dst_chan,
					                      prev_alu->bc.dst_rel);
				}

				n->src[s] = d;
			} else if (ctx.is_kcache_sel(src.sel)) {
				// kcache sources: decode bank-set and address from sel
				unsigned sel = src.sel, kc_addr;
				unsigned kc_set = ((sel >> 7) & 2) + ((sel >> 5) & 1);

				bc_kcache &kc = cf->bc.kc[kc_set];
				kc_addr = (kc.addr << 4) + (sel & 0x1F);
				n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan, (alu_kcache_index_mode)kc.index_mode);

				if (kc.index_mode != KC_INDEX_NONE) {
					assert(kc.index_mode != KC_LOCK_LOOP);
					ubo_indexing[kc.index_mode - KC_INDEX_0] = true;
				}
			} else if (src.sel < MAX_GPR) {
				value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel);

				n->src[s] = v;

			} else if (src.sel >= ALU_SRC_PARAM_OFFSET) {
				// using slot for value channel because in fact the slot
				// determines the channel that is loaded by INTERP_LOAD_P0
				// (and maybe some others).
				// otherwise GVN will consider INTERP_LOAD_P0s with the same
				// param index as equal instructions and leave only one of them
				n->src[s] = sh->get_special_ro_value(sel_chan(src.sel,
				                                              n->bc.slot));
			} else {
				// inline hardware constants
				switch (src.sel) {
				case ALU_SRC_0:
					n->src[s] = sh->get_const_value(0);
					break;
				case ALU_SRC_0_5:
					n->src[s] = sh->get_const_value(0.5f);
					break;
				case ALU_SRC_1:
					n->src[s] = sh->get_const_value(1.0f);
					break;
				case ALU_SRC_1_INT:
					n->src[s] = sh->get_const_value(1);
					break;
				case ALU_SRC_M_1_INT:
					n->src[s] = sh->get_const_value(-1);
					break;
				default:
					n->src[s] = sh->get_special_ro_value(src.sel);
					break;
				}
			}
		}

		// add UBO index values if any as dependencies
		if (ubo_indexing[0]) {
			n->src.push_back(get_cf_index_value(0));
		}
		if (ubo_indexing[1]) {
			n->src.push_back(get_cf_index_value(1));
		}

		if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) &&
		    ctx.is_cayman())
			// Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX
			save_set_cf_index(n->src[0], n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1);
	}

	// pack multislot instructions into alu_packed_node

	alu_packed_node *p = NULL;
	for (node_iterator N, I = g->begin(), E = g->end(); I != E; I = N) {
		N = I + 1;
		alu_node *a = static_cast<alu_node*>(*I);
		unsigned sflags = a->bc.slot_flags;

		if (sflags == AF_4V || (ctx.is_cayman() && sflags == AF_S)) {
			if (!p)
				p = sh->create_alu_packed();

			a->remove();
			p->push_back(a);
		}
	}

	if (p) {
		g->push_front(p);

		if (p->count() == 3 && ctx.is_cayman()) {
			// cayman's scalar instruction that can use 3 or 4 slots

			// FIXME for simplicity we'll always add 4th slot,
			// but probably we might want to always remove 4th slot and make
			// sure that regalloc won't choose 'w' component for dst

			alu_node *f = static_cast<alu_node*>(p->first);
			alu_node *a = sh->create_alu();
			a->src = f->src;
			a->dst.resize(f->dst.size());
			a->bc = f->bc;
			a->bc.slot = SLOT_W;
			p->push_back(a);
		}
	}

	return 0;
}
    564 
    565 int bc_parser::decode_fetch_clause(cf_node* cf) {
    566 	int r;
    567 	unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1;
    568 
    569 	cf->subtype = NST_TEX_CLAUSE;
    570 
    571 	while (cnt--) {
    572 		fetch_node *n = sh->create_fetch();
    573 		cf->push_back(n);
    574 		if ((r = dec->decode_fetch(i, n->bc)))
    575 			return r;
    576 		if (n->bc.src_rel || n->bc.dst_rel)
    577 			gpr_reladdr = true;
    578 
    579 	}
    580 	return 0;
    581 }
    582 
// Build IR operands for every fetch instruction of clause `cf`.
// SET_GRADIENTS_* / SET_TEXTURE_OFFSETS values are captured into local
// vectors and folded into the instructions that consume them; the set
// instructions are re-emitted later by bc_finalizer.
int bc_parser::prepare_fetch_clause(cf_node *cf) {

	// hidden state carried between instructions within the clause
	vvec grad_v, grad_h, texture_offsets;

	for (node_iterator I = cf->begin(), E = cf->end(); I != E; ++I) {

		fetch_node *n = static_cast<fetch_node*>(*I);
		assert(n->is_valid());

		unsigned flags = n->bc.op_ptr->flags;

		unsigned vtx = flags & FF_VTX;
		// vertex fetches use a context-defined source count, tex ops use 4
		unsigned num_src = vtx ? ctx.vtx_src_num : 4;

		n->dst.resize(4);

		if (flags & (FF_SETGRAD | FF_USEGRAD | FF_GETGRAD)) {
			sh->uses_gradients = true;
		}

		if (flags & (FF_SETGRAD | FF_SET_TEXTURE_OFFSETS)) {

			// capture the source values of the SET instruction into the
			// matching hidden vector
			vvec *grad = NULL;

			switch (n->bc.op) {
				case FETCH_OP_SET_GRADIENTS_V:
					grad = &grad_v;
					break;
				case FETCH_OP_SET_GRADIENTS_H:
					grad = &grad_h;
					break;
				case FETCH_OP_SET_TEXTURE_OFFSETS:
					grad = &texture_offsets;
					break;
				default:
					assert(!"unexpected SET_GRAD instruction");
					return -1;
			}

			if (grad->empty())
				grad->resize(4);

			for(unsigned s = 0; s < 4; ++s) {
				unsigned sw = n->bc.src_sel[s];
				if (sw <= SEL_W)
					(*grad)[s] = sh->get_gpr_value(true, n->bc.src_gpr,
					                               sw, false);
				else if (sw == SEL_0)
					(*grad)[s] = sh->get_const_value(0.0f);
				else if (sw == SEL_1)
					(*grad)[s] = sh->get_const_value(1.0f);
			}
		} else {
			// Fold source values for instructions with hidden target values in to the instructions
			// using them. The set instructions are later re-emitted by bc_finalizer
			if (flags & FF_USEGRAD) {
				// sources 4..7 = vertical gradients, 8..11 = horizontal
				n->src.resize(12);
				std::copy(grad_v.begin(), grad_v.end(), n->src.begin() + 4);
				std::copy(grad_h.begin(), grad_h.end(), n->src.begin() + 8);
			} else if (flags & FF_USE_TEXTURE_OFFSETS) {
				// sources 4..7 = texture offsets
				n->src.resize(8);
				std::copy(texture_offsets.begin(), texture_offsets.end(), n->src.begin() + 4);
			} else {
				n->src.resize(4);
			}

			for(int s = 0; s < 4; ++s) {
				if (n->bc.dst_sel[s] != SEL_MASK)
					n->dst[s] = sh->get_gpr_value(false, n->bc.dst_gpr, s, false);
				// NOTE: it doesn't matter here which components of the result we
				// are using, but original n->bc.dst_sel should be taken into
				// account when building the bytecode
			}
			for(unsigned s = 0; s < num_src; ++s) {
				if (n->bc.src_sel[s] <= SEL_W)
					n->src[s] = sh->get_gpr_value(true, n->bc.src_gpr,
					                              n->bc.src_sel[s], false);
			}

			// Scheduler will emit the appropriate instructions to set CF_IDX0/1
			if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) {
				n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1));
			}
			if (n->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
				n->src.push_back(get_cf_index_value(n->bc.resource_index_mode == V_SQ_CF_INDEX_1));
			}
		}
	}

	return 0;
}
    674 
// Lift the flat decoded CF list into structured sb IR: prepare clauses,
// convert jumps/loops into region/depart/repeat nodes, unroll burst
// exports and memory writes, and apply a small EMIT/CUT peephole.
int bc_parser::prepare_ir() {

	for(id_cf_map::iterator I = cf_map.begin(), E = cf_map.end(); I != E; ++I) {
		cf_node *c = *I;

		// cf_map is sparse (indexed by CF id); skip holes
		if (!c)
			continue;

		unsigned flags = c->bc.op_ptr->flags;

		if (flags & CF_ALU) {
			prepare_alu_clause(c);
		} else if (flags & CF_FETCH) {
			prepare_fetch_clause(c);
		} else if (c->bc.op == CF_OP_CALL_FS) {
			sh->init_call_fs(c);
			c->flags |= NF_SCHEDULE_EARLY | NF_DONT_MOVE;
		} else if (flags & CF_LOOP_START) {
			// pushes the new loop region onto loop_stack
			prepare_loop(c);
		} else if (c->bc.op == CF_OP_JUMP) {
			prepare_if(c);
		} else if (c->bc.op == CF_OP_LOOP_END) {
			loop_stack.pop();
		} else if (c->bc.op == CF_OP_LOOP_CONTINUE) {
			// replace CONTINUE with a repeat node targeting the current
			// loop; preceding siblings are moved inside it
			assert(!loop_stack.empty());
			repeat_node *rep = sh->create_repeat(loop_stack.top());
			if (c->parent->first != c)
				rep->move(c->parent->first, c);
			c->replace_with(rep);
			sh->simplify_dep_rep(rep);
		} else if (c->bc.op == CF_OP_LOOP_BREAK) {
			// same shape as CONTINUE, but with a depart (break) node
			assert(!loop_stack.empty());
			depart_node *dep = sh->create_depart(loop_stack.top());
			if (c->parent->first != c)
				dep->move(c->parent->first, c);
			c->replace_with(dep);
			sh->simplify_dep_rep(dep);
		} else if (flags & CF_EXP) {

			// unroll burst exports

			assert(c->bc.op == CF_OP_EXPORT || c->bc.op == CF_OP_EXPORT_DONE);

			c->bc.set_op(CF_OP_EXPORT);

			unsigned burst_count = c->bc.burst_count;
			unsigned eop = c->bc.end_of_program;

			c->bc.end_of_program = 0;
			c->bc.burst_count = 0;

			do {
				c->src.resize(4);

				// translate the per-component swizzle into IR sources
				for(int s = 0; s < 4; ++s) {
					switch (c->bc.sel[s]) {
					case SEL_0:
						c->src[s] = sh->get_const_value(0.0f);
						break;
					case SEL_1:
						c->src[s] = sh->get_const_value(1.0f);
						break;
					case SEL_MASK:
						break;
					default:
						if (c->bc.sel[s] <= SEL_W)
							c->src[s] = sh->get_gpr_value(true, c->bc.rw_gpr,
									c->bc.sel[s], false);
						else
							assert(!"invalid src_sel for export");
					}
				}

				if (!burst_count--)
					break;

				// each unrolled export reads the next GPR and writes the
				// next array_base slot
				cf_node *cf_next = sh->create_cf();
				cf_next->bc = c->bc;
				++cf_next->bc.rw_gpr;
				++cf_next->bc.array_base;

				c->insert_after(cf_next);
				c = cf_next;

			} while (1);

			// end_of_program goes on the last unrolled export
			c->bc.end_of_program = eop;
		} else if (flags & CF_MEM) {

			// unroll burst memory writes, same pattern as exports
			unsigned burst_count = c->bc.burst_count;
			unsigned eop = c->bc.end_of_program;

			c->bc.end_of_program = 0;
			c->bc.burst_count = 0;

			do {

				c->src.resize(4);

				for(int s = 0; s < 4; ++s) {
					if (c->bc.comp_mask & (1 << s))
						c->src[s] =
								sh->get_gpr_value(true, c->bc.rw_gpr, s, false);
				}

				if (((flags & CF_RAT) || (!(flags & CF_STRM))) && (c->bc.type & 1)) { // indexed write
					// sources 4..6 hold the index from index_gpr.xyz
					c->src.resize(8);
					for(int s = 0; s < 3; ++s) {
						c->src[4 + s] =
							sh->get_gpr_value(true, c->bc.index_gpr, s, false);
					}

					// FIXME probably we can relax it a bit
					c->flags |= NF_DONT_HOIST | NF_DONT_MOVE;
				}

				if (flags & CF_EMIT) {
					// Instruction implicitly depends on prior [EMIT_][CUT]_VERTEX
					c->src.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
					c->dst.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
					if (sh->target == TARGET_ES) {
						// For ES shaders this is an export
						c->flags |= NF_DONT_KILL;
					}
				}

				if (!burst_count--)
					break;

				cf_node *cf_next = sh->create_cf();
				cf_next->bc = c->bc;
				++cf_next->bc.rw_gpr;

				// FIXME is it correct?
				cf_next->bc.array_base += cf_next->bc.elem_size + 1;

				c->insert_after(cf_next);
				c = cf_next;
			} while (1);

			c->bc.end_of_program = eop;

		} else if (flags & CF_EMIT) {
			/* quick peephole: fuse adjacent EMIT_VERTEX + CUT_VERTEX
			 * with the same stream count into EMIT_CUT_VERTEX */
			cf_node *prev = static_cast<cf_node *>(c->prev);
			if (c->bc.op == CF_OP_CUT_VERTEX &&
				prev && prev->is_valid() &&
				prev->bc.op == CF_OP_EMIT_VERTEX &&
				c->bc.count == prev->bc.count) {
				prev->bc.set_op(CF_OP_EMIT_CUT_VERTEX);
				prev->bc.end_of_program = c->bc.end_of_program;
				c->remove();
			}
			else {
				c->flags |= NF_DONT_KILL | NF_DONT_HOIST | NF_DONT_MOVE;

				// serialize all emit/cut ops through SV_GEOMETRY_EMIT
				c->src.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
				c->dst.push_back(sh->get_special_value(SV_GEOMETRY_EMIT));
			}
		}
	}

	// every LOOP_START must have been matched by a LOOP_END
	assert(loop_stack.empty());
	return 0;
}
    840 
    841 int bc_parser::prepare_loop(cf_node* c) {
    842 	assert(c->bc.addr-1 < cf_map.size());
    843 
    844 	cf_node *end = cf_map[c->bc.addr - 1];
    845 	assert(end->bc.op == CF_OP_LOOP_END);
    846 	assert(c->parent == end->parent);
    847 
    848 	region_node *reg = sh->create_region();
    849 	repeat_node *rep = sh->create_repeat(reg);
    850 
    851 	reg->push_back(rep);
    852 	c->insert_before(reg);
    853 	rep->move(c, end->next);
    854 
    855 	reg->src_loop = true;
    856 
    857 	loop_stack.push(reg);
    858 	return 0;
    859 }
    860 
// Turn a JUMP (+ optional ELSE) into structured IR: a region containing
// nested depart nodes and an if_node conditioned on the exec mask.
int bc_parser::prepare_if(cf_node* c) {
	// NOTE(review): the assert checks addr-1 but cf_map is indexed with
	// addr below — looks off by one; confirm against the decoder's
	// cf_map population.
	assert(c->bc.addr-1 < cf_map.size());
	cf_node *c_else = NULL, *end = cf_map[c->bc.addr];

	if (!end)
		return 0; // not quite sure how this happens, malformed input?

	BCP_DUMP(
		sblog << "parsing JUMP @" << c->bc.id;
		sblog << "\n";
	);

	if (end->bc.op == CF_OP_ELSE) {
		BCP_DUMP(
			sblog << "  found ELSE : ";
			dump::dump_op(end);
			sblog << "\n";
		);

		// with an else clause, the join point is the ELSE's target
		c_else = end;
		end = cf_map[c_else->bc.addr];
	} else {
		BCP_DUMP(
			sblog << "  no else\n";
		);

		// no else clause: the jump target is both else-start and end
		c_else = end;
	}

	// ignore targets outside the current nesting level
	if (c_else->parent != c->parent)
		c_else = NULL;

	if (end && end->parent != c->parent)
		end = NULL;

	region_node *reg = sh->create_region();

	// dep2 wraps the whole if/else range, dep (inside reg) wraps the
	// else part; the if_node's "then" body departs via dep2
	depart_node *dep2 = sh->create_depart(reg);
	depart_node *dep = sh->create_depart(reg);
	if_node *n_if = sh->create_if();

	c->insert_before(reg);

	if (c_else != end)
		dep->move(c_else, end);
	dep2->move(c, end);

	reg->push_back(dep);
	dep->push_front(n_if);
	n_if->push_back(dep2);

	n_if->cond = sh->get_special_value(SV_EXEC_MASK);

	return 0;
}
    916 
    917 
    918 } // namespace r600_sb
    919