Home | History | Annotate | Download | only in r600
      1 /*
      2  * Copyright 2010 Jerome Glisse <glisse (at) freedesktop.org>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  */
     23 #include "r600_sq.h"
     24 #include "r600_llvm.h"
     25 #include "r600_formats.h"
     26 #include "r600_opcodes.h"
     27 #include "r600d.h"
     28 
     29 #include "pipe/p_shader_tokens.h"
     30 #include "tgsi/tgsi_info.h"
     31 #include "tgsi/tgsi_parse.h"
     32 #include "tgsi/tgsi_scan.h"
     33 #include "tgsi/tgsi_dump.h"
     34 #include "util/u_memory.h"
     35 #include <stdio.h>
     36 #include <errno.h>
     37 #include <byteswap.h>
     38 
     39 /* CAYMAN notes
     40 This note explains why many instructions are emitted in a loop on CAYMAN.
     41 
     42 -These 8xx t-slot only ops are implemented in all vector slots.
     43 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
     44 These 8xx t-slot only opcodes become vector ops, with all four
     45 slots expecting the arguments on sources a and b. Result is
     46 broadcast to all channels.
     47 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
     48 These 8xx t-slot only opcodes become vector ops in the z, y, and
     49 x slots.
     50 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
     51 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
     52 SQRT_IEEE/_64
     53 SIN/COS
     54 The w slot may have an independent co-issued operation, or if the
     55 result is required to be in the w slot, the opcode above may be
     56 issued in the w slot as well.
     57 The compiler must issue the source argument to slots z, y, and x
     58 */
     59 
     60 static int r600_pipe_shader(struct pipe_context *ctx, struct r600_pipe_shader *shader)
     61 {
     62 	struct r600_context *rctx = (struct r600_context *)ctx;
     63 	struct r600_shader *rshader = &shader->shader;
     64 	uint32_t *ptr;
     65 	int	i;
     66 
     67 	/* copy new shader */
     68 	if (shader->bo == NULL) {
     69 		shader->bo = (struct r600_resource*)
     70 			pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, rshader->bc.ndw * 4);
     71 		if (shader->bo == NULL) {
     72 			return -ENOMEM;
     73 		}
     74 		ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->cs_buf, rctx->cs, PIPE_TRANSFER_WRITE);
     75 		if (R600_BIG_ENDIAN) {
     76 			for (i = 0; i < rshader->bc.ndw; ++i) {
     77 				ptr[i] = bswap_32(rshader->bc.bytecode[i]);
     78 			}
     79 		} else {
     80 			memcpy(ptr, rshader->bc.bytecode, rshader->bc.ndw * sizeof(*ptr));
     81 		}
     82 		rctx->ws->buffer_unmap(shader->bo->cs_buf);
     83 	}
     84 	/* build state */
     85 	switch (rshader->processor_type) {
     86 	case TGSI_PROCESSOR_VERTEX:
     87 		if (rctx->chip_class >= EVERGREEN) {
     88 			evergreen_pipe_shader_vs(ctx, shader);
     89 		} else {
     90 			r600_pipe_shader_vs(ctx, shader);
     91 		}
     92 		break;
     93 	case TGSI_PROCESSOR_FRAGMENT:
     94 		if (rctx->chip_class >= EVERGREEN) {
     95 			evergreen_pipe_shader_ps(ctx, shader);
     96 		} else {
     97 			r600_pipe_shader_ps(ctx, shader);
     98 		}
     99 		break;
    100 	default:
    101 		return -EINVAL;
    102 	}
    103 	return 0;
    104 }
    105 
    106 static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader);
    107 
    108 int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader)
    109 {
    110 	static int dump_shaders = -1;
    111 	struct r600_context *rctx = (struct r600_context *)ctx;
    112 	struct r600_pipe_shader_selector *sel = shader->selector;
    113 	int r;
    114 
    115 	/* Would like some magic "get_bool_option_once" routine.
    116 	*/
    117 	if (dump_shaders == -1)
    118 		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
    119 
    120 	if (dump_shaders) {
    121 		fprintf(stderr, "--------------------------------------------------------------\n");
    122 		tgsi_dump(sel->tokens, 0);
    123 
    124 		if (sel->so.num_outputs) {
    125 			unsigned i;
    126 			fprintf(stderr, "STREAMOUT\n");
    127 			for (i = 0; i < sel->so.num_outputs; i++) {
    128 				unsigned mask = ((1 << sel->so.output[i].num_components) - 1) <<
    129 						sel->so.output[i].start_component;
    130 				fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
    131 					sel->so.output[i].output_buffer, sel->so.output[i].register_index,
    132 				        mask & 1 ? "x" : "_",
    133 				        (mask >> 1) & 1 ? "y" : "_",
    134 				        (mask >> 2) & 1 ? "z" : "_",
    135 				        (mask >> 3) & 1 ? "w" : "_");
    136 			}
    137 		}
    138 	}
    139 	r = r600_shader_from_tgsi(rctx, shader);
    140 	if (r) {
    141 		R600_ERR("translation from TGSI failed !\n");
    142 		return r;
    143 	}
    144 	r = r600_bytecode_build(&shader->shader.bc);
    145 	if (r) {
    146 		R600_ERR("building bytecode failed !\n");
    147 		return r;
    148 	}
    149 	if (dump_shaders) {
    150 		r600_bytecode_dump(&shader->shader.bc);
    151 		fprintf(stderr, "______________________________________________________________\n");
    152 	}
    153 	return r600_pipe_shader(ctx, shader);
    154 }
    155 
    156 void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
    157 {
    158 	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
    159 	r600_bytecode_clear(&shader->shader.bc);
    160 }
    161 
    162 /*
    163  * tgsi -> r600 shader
    164  */
    165 struct r600_shader_tgsi_instruction;
    166 
    167 struct r600_shader_src {	/* one source operand; r600_shader_ctx keeps four of these in src[] */
    168 	unsigned				sel;	/* NOTE(review): presumably the register/constant selector - users are outside this view */
    169 	unsigned				swizzle[4];	/* one entry per channel; presumably the component selection - confirm */
    170 	unsigned				neg;	/* presumably a negate modifier - confirm against the users */
    171 	unsigned				abs;	/* presumably an absolute-value modifier - confirm */
    172 	unsigned				rel;	/* presumably relative (indirect) addressing - confirm */
    173 	uint32_t				value[4];	/* four 32-bit words; presumably literal values - confirm */
    174 };
    175 
    176 struct r600_shader_ctx {	/* working state passed to every translation helper in this file */
    177 	struct tgsi_shader_info			info;
    178 	struct tgsi_parse_context		parse;	/* current token; helpers read parse.FullToken */
    179 	const struct tgsi_token			*tokens;
    180 	unsigned				type;	/* compared against TGSI_PROCESSOR_* values */
    181 	unsigned				file_offset[TGSI_FILE_COUNT];	/* added to a declaration's Range.First to get its GPR */
    182 	unsigned				temp_reg;	/* base that r600_get_temp() adds its running counter to */
    183 	struct r600_shader_tgsi_instruction	*inst_info;	/* entry of the per-chip instruction table for the current opcode */
    184 	struct r600_bytecode			*bc;	/* bytecode being assembled */
    185 	struct r600_shader			*shader;	/* not set by r600_compute_shader_create() */
    186 	struct r600_shader_src			src[4];
    187 	uint32_t				*literals;
    188 	uint32_t				nliterals;
    189 	uint32_t				max_driver_temp_used;	/* incremented by each r600_get_temp() call */
    190 	/* needed for evergreen interpolation */
    191 	boolean                                 input_centroid;
    192 	boolean                                 input_linear;
    193 	boolean                                 input_perspective;
    194 	int					num_interp_gpr;
    195 	int					face_gpr;	/* GPR of the TGSI_SEMANTIC_FACE fragment input */
    196 	int					colors_used;	/* number of TGSI_SEMANTIC_COLOR fragment inputs declared */
    197 	boolean                 clip_vertex_write;	/* set when a vertex shader declares a CLIPVERTEX output */
    198 	unsigned                cv_output;	/* index of that CLIPVERTEX output */
    199 	int					fragcoord_input;	/* index of the TGSI_SEMANTIC_POSITION fragment input */
    200 	int					native_integers;	/* when zero, the INSTANCEID declaration emits an INT_TO_FLT */
    201 };
    202 
    203 struct r600_shader_tgsi_instruction {	/* element of the per-chip tables, which are indexed by TGSI opcode */
    204 	unsigned	tgsi_opcode;
    205 	unsigned	is_op3;	/* NOTE(review): presumably marks three-source ALU opcodes - confirm against the tables */
    206 	unsigned	r600_opcode;
    207 	int (*process)(struct r600_shader_ctx *ctx);	/* handler taking the translation context */
    208 };
    209 
    210 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
    211 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
    212 static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only);
    213 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
    214 static int tgsi_else(struct r600_shader_ctx *ctx);
    215 static int tgsi_endif(struct r600_shader_ctx *ctx);
    216 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
    217 static int tgsi_endloop(struct r600_shader_ctx *ctx);
    218 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
    219 
    220 /*
    221  * bytestream -> r600 shader
    222  *
    223  * These functions are used to transform the output of the LLVM backend into
    224  * struct r600_bytecode.
    225  */
    226 
    227 static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
    228 				unsigned char * bytes,	unsigned num_bytes);
    229 
    230 #ifdef HAVE_OPENCL
    231 int r600_compute_shader_create(struct pipe_context * ctx,
    232 	LLVMModuleRef mod,  struct r600_bytecode * bytecode)
    233 {
    234 	struct r600_context *r600_ctx = (struct r600_context *)ctx;
    235 	unsigned char * bytes;
    236 	unsigned byte_count;
    237 	struct r600_shader_ctx shader_ctx;
    238 	unsigned dump = 0;
    239 
    240 	if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
    241 		dump = 1;
    242 	}
    243 
    244 	r600_llvm_compile(mod, &bytes, &byte_count, r600_ctx->family , dump);
    245 	shader_ctx.bc = bytecode;
    246 	r600_bytecode_init(shader_ctx.bc, r600_ctx->chip_class, r600_ctx->family);
    247 	shader_ctx.bc->type = TGSI_PROCESSOR_COMPUTE;
    248 	r600_bytecode_from_byte_stream(&shader_ctx, bytes, byte_count);
    249 	if (shader_ctx.bc->chip_class == CAYMAN) {
    250 		cm_bytecode_add_cf_end(shader_ctx.bc);
    251 	}
    252 	r600_bytecode_build(shader_ctx.bc);
    253 	if (dump) {
    254 		r600_bytecode_dump(shader_ctx.bc);
    255 	}
    256 	return 1;
    257 }
    258 
    259 #endif /* HAVE_OPENCL */
    260 
    261 static uint32_t i32_from_byte_stream(unsigned char * bytes,
    262 		unsigned * bytes_read)
    263 {
    264 	unsigned i;
    265 	uint32_t out = 0;
    266 	for (i = 0; i < 4; i++) {
    267 		out |= bytes[(*bytes_read)++] << (8 * i);
    268 	}
    269 	return out;
    270 }
    271 
    272 static unsigned r600_src_from_byte_stream(unsigned char * bytes,
    273 		unsigned bytes_read, struct r600_bytecode_alu * alu, unsigned src_idx)
    274 {
    275 	unsigned i;
    276 	unsigned sel0, sel1;
    277 	sel0 = bytes[bytes_read++];
    278 	sel1 = bytes[bytes_read++];
    279 	alu->src[src_idx].sel = sel0 | (sel1 << 8);
    280 	alu->src[src_idx].chan = bytes[bytes_read++];
    281 	alu->src[src_idx].neg = bytes[bytes_read++];
    282 	alu->src[src_idx].abs = bytes[bytes_read++];
    283 	alu->src[src_idx].rel = bytes[bytes_read++];
    284 	alu->src[src_idx].kc_bank = bytes[bytes_read++];
    285 	for (i = 0; i < 4; i++) {
    286 		alu->src[src_idx].value |= bytes[bytes_read++] << (i * 8);
    287 	}
    288 	return bytes_read;
    289 }
    290 
    291 static unsigned r600_alu_from_byte_stream(struct r600_shader_ctx *ctx,
    292 				unsigned char * bytes, unsigned bytes_read)
    293 {
    294 	unsigned src_idx;
    295 	unsigned inst0, inst1;
    296 	unsigned push_modifier;
    297 	struct r600_bytecode_alu alu;
    298 	memset(&alu, 0, sizeof(alu));
    299 	for(src_idx = 0; src_idx < 3; src_idx++) {
    300 		bytes_read = r600_src_from_byte_stream(bytes, bytes_read,
    301 								&alu, src_idx);
    302 	}
    303 
    304 	alu.dst.sel = bytes[bytes_read++];
    305 	alu.dst.chan = bytes[bytes_read++];
    306 	alu.dst.clamp = bytes[bytes_read++];
    307 	alu.dst.write = bytes[bytes_read++];
    308 	alu.dst.rel = bytes[bytes_read++];
    309 	inst0 = bytes[bytes_read++];
    310 	inst1 = bytes[bytes_read++];
    311 	alu.inst = inst0 | (inst1 << 8);
    312 	alu.last = bytes[bytes_read++];
    313 	alu.is_op3 = bytes[bytes_read++];
    314 	push_modifier = bytes[bytes_read++];
    315 	alu.pred_sel = bytes[bytes_read++];
    316 	alu.bank_swizzle = bytes[bytes_read++];
    317 	alu.bank_swizzle_force = bytes[bytes_read++];
    318 	alu.omod = bytes[bytes_read++];
    319 	alu.index_mode = bytes[bytes_read++];
    320 
    321 
    322 	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE) ||
    323 	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE) ||
    324 	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT) ||
    325 	    alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT)) {
    326 		alu.update_pred = 1;
    327 		alu.dst.write = 0;
    328 		alu.src[1].sel = V_SQ_ALU_SRC_0;
    329 		alu.src[1].chan = 0;
    330 		alu.last = 1;
    331     }
    332 
    333     if (push_modifier) {
    334         alu.pred_sel = 0;
    335 		alu.execute_mask = 1;
    336 		r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
    337 	} else
    338 		r600_bytecode_add_alu(ctx->bc, &alu);
    339 
    340 
    341 	/* XXX: Handle other KILL instructions */
    342 	if (alu.inst == CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT)) {
    343 		ctx->shader->uses_kill = 1;
    344 		/* XXX: This should be enforced in the LLVM backend. */
    345 		ctx->bc->force_add_cf = 1;
    346 	}
    347 	return bytes_read;
    348 }
    349 
    350 static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
    351 	unsigned pred_inst)
    352 {
    353 	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
    354 	fc_pushlevel(ctx, FC_IF);
    355 	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
    356 }
    357 
    358 static void r600_break_from_byte_stream(struct r600_shader_ctx *ctx,
    359 			struct r600_bytecode_alu *alu, unsigned compare_opcode)
    360 {
    361 	unsigned opcode = TGSI_OPCODE_BRK;
    362 	if (ctx->bc->chip_class == CAYMAN)
    363 		ctx->inst_info = &cm_shader_tgsi_instruction[opcode];
    364 	else if (ctx->bc->chip_class >= EVERGREEN)
    365 		ctx->inst_info = &eg_shader_tgsi_instruction[opcode];
    366 	else
    367 		ctx->inst_info = &r600_shader_tgsi_instruction[opcode];
    368 	llvm_if(ctx, alu, compare_opcode);
    369 	tgsi_loop_brk_cont(ctx);
    370 	tgsi_endif(ctx);
    371 }
    372 
    373 static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
    374 				unsigned char * bytes, unsigned bytes_read)
    375 {
    376 	struct r600_bytecode_alu alu;
    377 	unsigned inst;
    378 	memset(&alu, 0, sizeof(alu));
    379 	bytes_read = r600_src_from_byte_stream(bytes, bytes_read, &alu, 0);
    380 	inst = bytes[bytes_read++];
    381 	switch (inst) {
    382 	case 0: /* FC_IF */
    383 		llvm_if(ctx, &alu,
    384 			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
    385 		break;
    386 	case 1: /* FC_IF_INT */
    387 		llvm_if(ctx, &alu,
    388 			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
    389 		break;
    390 	case 2: /* FC_ELSE */
    391 		tgsi_else(ctx);
    392 		break;
    393 	case 3: /* FC_ENDIF */
    394 		tgsi_endif(ctx);
    395 		break;
    396 	case 4: /* FC_BGNLOOP */
    397 		tgsi_bgnloop(ctx);
    398 		break;
    399 	case 5: /* FC_ENDLOOP */
    400 		tgsi_endloop(ctx);
    401 		break;
    402 	case 6: /* FC_BREAK */
    403 		r600_break_from_byte_stream(ctx, &alu,
    404 			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
    405 		break;
    406 	case 7: /* FC_BREAK_NZ_INT */
    407 		r600_break_from_byte_stream(ctx, &alu,
    408 			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
    409 		break;
    410 	case 8: /* FC_CONTINUE */
    411 		{
    412 			unsigned opcode = TGSI_OPCODE_CONT;
    413 			if (ctx->bc->chip_class == CAYMAN) {
    414 				ctx->inst_info =
    415 					&cm_shader_tgsi_instruction[opcode];
    416 			} else if (ctx->bc->chip_class >= EVERGREEN) {
    417 				ctx->inst_info =
    418 					&eg_shader_tgsi_instruction[opcode];
    419 			} else {
    420 				ctx->inst_info =
    421 					&r600_shader_tgsi_instruction[opcode];
    422 			}
    423 			tgsi_loop_brk_cont(ctx);
    424 		}
    425 		break;
    426 	case 9: /* FC_BREAK_Z_INT */
    427 		r600_break_from_byte_stream(ctx, &alu,
    428 			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
    429 		break;
    430 	case 10: /* FC_BREAK_NZ */
    431 		r600_break_from_byte_stream(ctx, &alu,
    432 			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE));
    433 		break;
    434 	}
    435 
    436 	return bytes_read;
    437 }
    438 
    439 static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
    440 				unsigned char * bytes, unsigned bytes_read)
    441 {
    442 	struct r600_bytecode_tex tex;
    443 
    444 	tex.inst = bytes[bytes_read++];
    445 	tex.resource_id = bytes[bytes_read++];
    446 	tex.src_gpr = bytes[bytes_read++];
    447 	tex.src_rel = bytes[bytes_read++];
    448 	tex.dst_gpr = bytes[bytes_read++];
    449 	tex.dst_rel = bytes[bytes_read++];
    450 	tex.dst_sel_x = bytes[bytes_read++];
    451 	tex.dst_sel_y = bytes[bytes_read++];
    452 	tex.dst_sel_z = bytes[bytes_read++];
    453 	tex.dst_sel_w = bytes[bytes_read++];
    454 	tex.lod_bias = bytes[bytes_read++];
    455 	tex.coord_type_x = bytes[bytes_read++];
    456 	tex.coord_type_y = bytes[bytes_read++];
    457 	tex.coord_type_z = bytes[bytes_read++];
    458 	tex.coord_type_w = bytes[bytes_read++];
    459 	tex.offset_x = bytes[bytes_read++];
    460 	tex.offset_y = bytes[bytes_read++];
    461 	tex.offset_z = bytes[bytes_read++];
    462 	tex.sampler_id = bytes[bytes_read++];
    463 	tex.src_sel_x = bytes[bytes_read++];
    464 	tex.src_sel_y = bytes[bytes_read++];
    465 	tex.src_sel_z = bytes[bytes_read++];
    466 	tex.src_sel_w = bytes[bytes_read++];
    467 
    468 	r600_bytecode_add_tex(ctx->bc, &tex);
    469 
    470 	return bytes_read;
    471 }
    472 
    473 static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
    474 	unsigned char * bytes, unsigned bytes_read)
    475 {
    476 	struct r600_bytecode_vtx vtx;
    477 
    478 	uint32_t word0 = i32_from_byte_stream(bytes, &bytes_read);
    479         uint32_t word1 = i32_from_byte_stream(bytes, &bytes_read);
    480 	uint32_t word2 = i32_from_byte_stream(bytes, &bytes_read);
    481 
    482 	memset(&vtx, 0, sizeof(vtx));
    483 
    484 	/* WORD0 */
    485 	vtx.inst = G_SQ_VTX_WORD0_VTX_INST(word0);
    486 	vtx.fetch_type = G_SQ_VTX_WORD0_FETCH_TYPE(word0);
    487 	vtx.buffer_id = G_SQ_VTX_WORD0_BUFFER_ID(word0);
    488 	vtx.src_gpr = G_SQ_VTX_WORD0_SRC_GPR(word0);
    489 	vtx.src_sel_x = G_SQ_VTX_WORD0_SRC_SEL_X(word0);
    490 	vtx.mega_fetch_count = G_SQ_VTX_WORD0_MEGA_FETCH_COUNT(word0);
    491 
    492 	/* WORD1 */
    493 	vtx.dst_gpr = G_SQ_VTX_WORD1_GPR_DST_GPR(word1);
    494 	vtx.dst_sel_x = G_SQ_VTX_WORD1_DST_SEL_X(word1);
    495 	vtx.dst_sel_y = G_SQ_VTX_WORD1_DST_SEL_Y(word1);
    496 	vtx.dst_sel_z = G_SQ_VTX_WORD1_DST_SEL_Z(word1);
    497 	vtx.dst_sel_w = G_SQ_VTX_WORD1_DST_SEL_W(word1);
    498 	vtx.use_const_fields = G_SQ_VTX_WORD1_USE_CONST_FIELDS(word1);
    499 	vtx.data_format = G_SQ_VTX_WORD1_DATA_FORMAT(word1);
    500 	vtx.num_format_all = G_SQ_VTX_WORD1_NUM_FORMAT_ALL(word1);
    501 	vtx.format_comp_all = G_SQ_VTX_WORD1_FORMAT_COMP_ALL(word1);
    502 	vtx.srf_mode_all = G_SQ_VTX_WORD1_SRF_MODE_ALL(word1);
    503 
    504 	/* WORD 2*/
    505 	vtx.offset = G_SQ_VTX_WORD2_OFFSET(word2);
    506 	vtx.endian = G_SQ_VTX_WORD2_ENDIAN_SWAP(word2);
    507 
    508 	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
    509 		fprintf(stderr, "Error adding vtx\n");
    510 	}
    511 	/* Use the Texture Cache */
    512 	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
    513 	return bytes_read;
    514 }
    515 
    516 static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
    517 				unsigned char * bytes,	unsigned num_bytes)
    518 {
    519 	unsigned bytes_read = 0;
    520 	unsigned i, byte;
    521 	while (bytes_read < num_bytes) {
    522 		char inst_type = bytes[bytes_read++];
    523 		switch (inst_type) {
    524 		case 0:
    525 			bytes_read = r600_alu_from_byte_stream(ctx, bytes,
    526 								bytes_read);
    527 			break;
    528 		case 1:
    529 			bytes_read = r600_tex_from_byte_stream(ctx, bytes,
    530 								bytes_read);
    531 			break;
    532 		case 2:
    533 			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
    534 								bytes_read);
    535 			break;
    536 		case 3:
    537 			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
    538 			for (i = 0; i < 2; i++) {
    539 				for (byte = 0 ; byte < 4; byte++) {
    540 					ctx->bc->cf_last->isa[i] |=
    541 					(bytes[bytes_read++] << (byte * 8));
    542 				}
    543 			}
    544 			break;
    545 
    546 		case 4:
    547 			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
    548 								bytes_read);
    549 			break;
    550 		default:
    551 			/* XXX: Error here */
    552 			break;
    553 		}
    554 	}
    555 }
    556 
    557 /* End bytestream -> r600 shader functions*/
    558 
    559 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
    560 {
    561 	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
    562 	int j;
    563 
    564 	if (i->Instruction.NumDstRegs > 1) {
    565 		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
    566 		return -EINVAL;
    567 	}
    568 	if (i->Instruction.Predicate) {
    569 		R600_ERR("predicate unsupported\n");
    570 		return -EINVAL;
    571 	}
    572 #if 0
    573 	if (i->Instruction.Label) {
    574 		R600_ERR("label unsupported\n");
    575 		return -EINVAL;
    576 	}
    577 #endif
    578 	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
    579 		if (i->Src[j].Register.Dimension) {
    580 			R600_ERR("unsupported src %d (dimension %d)\n", j,
    581 				 i->Src[j].Register.Dimension);
    582 			return -EINVAL;
    583 		}
    584 	}
    585 	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
    586 		if (i->Dst[j].Register.Dimension) {
    587 			R600_ERR("unsupported dst (dimension)\n");
    588 			return -EINVAL;
    589 		}
    590 	}
    591 	return 0;
    592 }
    593 
    594 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
    595 {
    596 	int i, r;
    597 	struct r600_bytecode_alu alu;
    598 	int gpr = 0, base_chan = 0;
    599 	int ij_index = 0;
    600 
    601 	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
    602 		ij_index = 0;
    603 		if (ctx->shader->input[input].centroid)
    604 			ij_index++;
    605 	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
    606 		ij_index = 0;
    607 		/* if we have perspective add one */
    608 		if (ctx->input_perspective)  {
    609 			ij_index++;
    610 			/* if we have perspective centroid */
    611 			if (ctx->input_centroid)
    612 				ij_index++;
    613 		}
    614 		if (ctx->shader->input[input].centroid)
    615 			ij_index++;
    616 	}
    617 
    618 	/* work out gpr and base_chan from index */
    619 	gpr = ij_index / 2;
    620 	base_chan = (2 * (ij_index % 2)) + 1;
    621 
    622 	for (i = 0; i < 8; i++) {
    623 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    624 
    625 		if (i < 4)
    626 			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_ZW;
    627 		else
    628 			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_XY;
    629 
    630 		if ((i > 1) && (i < 6)) {
    631 			alu.dst.sel = ctx->shader->input[input].gpr;
    632 			alu.dst.write = 1;
    633 		}
    634 
    635 		alu.dst.chan = i % 4;
    636 
    637 		alu.src[0].sel = gpr;
    638 		alu.src[0].chan = (base_chan - (i % 2));
    639 
    640 		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
    641 
    642 		alu.bank_swizzle_force = SQ_ALU_VEC_210;
    643 		if ((i % 4) == 3)
    644 			alu.last = 1;
    645 		r = r600_bytecode_add_alu(ctx->bc, &alu);
    646 		if (r)
    647 			return r;
    648 	}
    649 	return 0;
    650 }
    651 
    652 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
    653 {
    654 	int i, r;
    655 	struct r600_bytecode_alu alu;
    656 
    657 	for (i = 0; i < 4; i++) {
    658 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    659 
    660 		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INTERP_LOAD_P0;
    661 
    662 		alu.dst.sel = ctx->shader->input[input].gpr;
    663 		alu.dst.write = 1;
    664 
    665 		alu.dst.chan = i;
    666 
    667 		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
    668 		alu.src[0].chan = i;
    669 
    670 		if (i == 3)
    671 			alu.last = 1;
    672 		r = r600_bytecode_add_alu(ctx->bc, &alu);
    673 		if (r)
    674 			return r;
    675 	}
    676 	return 0;
    677 }
    678 
    679 /*
    680  * Special export handling in shaders
    681  *
    682  * shader export ARRAY_BASE for EXPORT_POS:
    683  * 60 is position
    684  * 61 is misc vector
    685  * 62, 63 are clip distance vectors
    686  *
    687  * The use of the values exported in 61-63 is controlled by PA_CL_VS_OUT_CNTL:
    688  * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
    689  * USE_VTX_POINT_SIZE - point size in the X channel of export 61
    690  * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
    691  * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
    692  * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
    693  * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
    694  * exclusive with render target index)
    695  * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
    696  *
    697  *
    698  * shader export ARRAY_BASE for EXPORT_PIXEL:
    699  * 0-7 CB targets
    700  * 61 computed Z vector
    701  *
    702  * The use of the values exported in the computed Z vector are controlled
    703  * by DB_SHADER_CONTROL:
    704  * Z_EXPORT_ENABLE - Z as a float in RED
    705  * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
    706  * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
    707  * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
    708  * DB_SOURCE_FORMAT - export control restrictions
    709  *
    710  */
    711 
    712 
    713 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
    714 static int r600_spi_sid(struct r600_shader_io * io)
    715 {
    716 	int index, name = io->name;
    717 
    718 	/* These params are handled differently, they don't need
    719 	 * semantic indices, so we'll use 0 for them.
    720 	 */
    721 	if (name == TGSI_SEMANTIC_POSITION ||
    722 		name == TGSI_SEMANTIC_PSIZE ||
    723 		name == TGSI_SEMANTIC_FACE)
    724 		index = 0;
    725 	else {
    726 		if (name == TGSI_SEMANTIC_GENERIC) {
    727 			/* For generic params simply use sid from tgsi */
    728 			index = io->sid;
    729 		} else {
    730 			/* For non-generic params - pack name and sid into 8 bits */
    731 			index = 0x80 | (name<<3) | (io->sid);
    732 		}
    733 
    734 		/* Make sure that all really used indices have nonzero value, so
    735 		 * we can just compare it to 0 later instead of comparing the name
    736 		 * with different values to detect special cases. */
    737 		index++;
    738 	}
    739 
    740 	return index;
    741 };
    742 
    743 /* turn input into interpolate on EG */
    744 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
    745 {
    746 	int r = 0;
    747 
    748 	if (ctx->shader->input[index].spi_sid) {
    749 		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
    750 		if (ctx->shader->input[index].interpolate > 0) {
    751 			r = evergreen_interp_alu(ctx, index);
    752 		} else {
    753 			r = evergreen_interp_flat(ctx, index);
    754 		}
    755 	}
    756 	return r;
    757 }
    758 
    759 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
    760 {
    761 	struct r600_bytecode_alu alu;
    762 	int i, r;
    763 	int gpr_front = ctx->shader->input[front].gpr;
    764 	int gpr_back = ctx->shader->input[back].gpr;
    765 
    766 	for (i = 0; i < 4; i++) {
    767 		memset(&alu, 0, sizeof(alu));
    768 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
    769 		alu.is_op3 = 1;
    770 		alu.dst.write = 1;
    771 		alu.dst.sel = gpr_front;
    772 		alu.src[0].sel = ctx->face_gpr;
    773 		alu.src[1].sel = gpr_front;
    774 		alu.src[2].sel = gpr_back;
    775 
    776 		alu.dst.chan = i;
    777 		alu.src[1].chan = i;
    778 		alu.src[2].chan = i;
    779 		alu.last = (i==3);
    780 
    781 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
    782 			return r;
    783 	}
    784 
    785 	return 0;
    786 }
    787 
    788 static int tgsi_declaration(struct r600_shader_ctx *ctx)
    789 {
    790 	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
    791 	unsigned i;
    792 	int r;
    793 
    794 	switch (d->Declaration.File) {
    795 	case TGSI_FILE_INPUT:
    796 		i = ctx->shader->ninput++;
    797 		ctx->shader->input[i].name = d->Semantic.Name;
    798 		ctx->shader->input[i].sid = d->Semantic.Index;
    799 		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
    800 		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
    801 		ctx->shader->input[i].centroid = d->Interp.Centroid;
    802 		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
    803 		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
    804 			switch (ctx->shader->input[i].name) {
    805 			case TGSI_SEMANTIC_FACE:
    806 				ctx->face_gpr = ctx->shader->input[i].gpr;
    807 				break;
    808 			case TGSI_SEMANTIC_COLOR:
    809 				ctx->colors_used++;
    810 				break;
    811 			case TGSI_SEMANTIC_POSITION:
    812 				ctx->fragcoord_input = i;
    813 				break;
    814 			}
    815 			if (ctx->bc->chip_class >= EVERGREEN) {
    816 				if ((r = evergreen_interp_input(ctx, i)))
    817 					return r;
    818 			}
    819 		}
    820 		break;
    821 	case TGSI_FILE_OUTPUT:
    822 		i = ctx->shader->noutput++;
    823 		ctx->shader->output[i].name = d->Semantic.Name;
    824 		ctx->shader->output[i].sid = d->Semantic.Index;
    825 		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
    826 		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
    827 		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
    828 		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
    829 		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
    830 			switch (d->Semantic.Name) {
    831 			case TGSI_SEMANTIC_CLIPDIST:
    832 				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
    833 				break;
    834 			case TGSI_SEMANTIC_PSIZE:
    835 				ctx->shader->vs_out_misc_write = 1;
    836 				ctx->shader->vs_out_point_size = 1;
    837 				break;
    838 			case TGSI_SEMANTIC_CLIPVERTEX:
    839 				ctx->clip_vertex_write = TRUE;
    840 				ctx->cv_output = i;
    841 				break;
    842 			}
    843 		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
    844 			switch (d->Semantic.Name) {
    845 			case TGSI_SEMANTIC_COLOR:
    846 				ctx->shader->nr_ps_max_color_exports++;
    847 				break;
    848 			}
    849 		}
    850 		break;
    851 	case TGSI_FILE_CONSTANT:
    852 	case TGSI_FILE_TEMPORARY:
    853 	case TGSI_FILE_SAMPLER:
    854 	case TGSI_FILE_ADDRESS:
    855 		break;
    856 
    857 	case TGSI_FILE_SYSTEM_VALUE:
    858 		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
    859 			if (!ctx->native_integers) {
    860 				struct r600_bytecode_alu alu;
    861 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    862 
    863 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
    864 				alu.src[0].sel = 0;
    865 				alu.src[0].chan = 3;
    866 
    867 				alu.dst.sel = 0;
    868 				alu.dst.chan = 3;
    869 				alu.dst.write = 1;
    870 				alu.last = 1;
    871 
    872 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
    873 					return r;
    874 			}
    875 			break;
    876 		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
    877 			break;
    878 	default:
    879 		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
    880 		return -EINVAL;
    881 	}
    882 	return 0;
    883 }
    884 
    885 static int r600_get_temp(struct r600_shader_ctx *ctx)
    886 {
    887 	return ctx->temp_reg + ctx->max_driver_temp_used++;
    888 }
    889 
    890 /*
    891  * for evergreen we need to scan the shader to find the number of GPRs we need to
    892  * reserve for interpolation.
    893  *
    894  * we need to know if we are going to emit
    895  * any centroid inputs
    896  * if perspective and linear are required
    897 */
    898 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
    899 {
    900 	int i;
    901 	int num_baryc;
    902 
    903 	ctx->input_linear = FALSE;
    904 	ctx->input_perspective = FALSE;
    905 	ctx->input_centroid = FALSE;
    906 	ctx->num_interp_gpr = 1;
    907 
    908 	/* any centroid inputs */
    909 	for (i = 0; i < ctx->info.num_inputs; i++) {
    910 		/* skip position/face */
    911 		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
    912 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
    913 			continue;
    914 		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
    915 			ctx->input_linear = TRUE;
    916 		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
    917 			ctx->input_perspective = TRUE;
    918 		if (ctx->info.input_centroid[i])
    919 			ctx->input_centroid = TRUE;
    920 	}
    921 
    922 	num_baryc = 0;
    923 	/* ignoring sample for now */
    924 	if (ctx->input_perspective)
    925 		num_baryc++;
    926 	if (ctx->input_linear)
    927 		num_baryc++;
    928 	if (ctx->input_centroid)
    929 		num_baryc *= 2;
    930 
    931 	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
    932 
    933 	/* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
    934 	return ctx->num_interp_gpr;
    935 }
    936 
    937 static void tgsi_src(struct r600_shader_ctx *ctx,
    938 		     const struct tgsi_full_src_register *tgsi_src,
    939 		     struct r600_shader_src *r600_src)
    940 {
    941 	memset(r600_src, 0, sizeof(*r600_src));
    942 	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
    943 	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
    944 	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
    945 	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
    946 	r600_src->neg = tgsi_src->Register.Negate;
    947 	r600_src->abs = tgsi_src->Register.Absolute;
    948 
    949 	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
    950 		int index;
    951 		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
    952 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
    953 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
    954 
    955 			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
    956 			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
    957 			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
    958 				return;
    959 		}
    960 		index = tgsi_src->Register.Index;
    961 		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
    962 		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
    963 	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
    964 		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
    965 			r600_src->swizzle[0] = 3;
    966 			r600_src->swizzle[1] = 3;
    967 			r600_src->swizzle[2] = 3;
    968 			r600_src->swizzle[3] = 3;
    969 			r600_src->sel = 0;
    970 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
    971 			r600_src->swizzle[0] = 0;
    972 			r600_src->swizzle[1] = 0;
    973 			r600_src->swizzle[2] = 0;
    974 			r600_src->swizzle[3] = 0;
    975 			r600_src->sel = 0;
    976 		}
    977 	} else {
    978 		if (tgsi_src->Register.Indirect)
    979 			r600_src->rel = V_SQ_REL_RELATIVE;
    980 		r600_src->sel = tgsi_src->Register.Index;
    981 		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
    982 	}
    983 }
    984 
    985 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int offset, unsigned int dst_reg)
    986 {
    987 	struct r600_bytecode_vtx vtx;
    988 	unsigned int ar_reg;
    989 	int r;
    990 
    991 	if (offset) {
    992 		struct r600_bytecode_alu alu;
    993 
    994 		memset(&alu, 0, sizeof(alu));
    995 
    996 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
    997 		alu.src[0].sel = ctx->bc->ar_reg;
    998 
    999 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   1000 		alu.src[1].value = offset;
   1001 
   1002 		alu.dst.sel = dst_reg;
   1003 		alu.dst.write = 1;
   1004 		alu.last = 1;
   1005 
   1006 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   1007 			return r;
   1008 
   1009 		ar_reg = dst_reg;
   1010 	} else {
   1011 		ar_reg = ctx->bc->ar_reg;
   1012 	}
   1013 
   1014 	memset(&vtx, 0, sizeof(vtx));
   1015 	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
   1016 	vtx.src_gpr = ar_reg;
   1017 	vtx.mega_fetch_count = 16;
   1018 	vtx.dst_gpr = dst_reg;
   1019 	vtx.dst_sel_x = 0;		/* SEL_X */
   1020 	vtx.dst_sel_y = 1;		/* SEL_Y */
   1021 	vtx.dst_sel_z = 2;		/* SEL_Z */
   1022 	vtx.dst_sel_w = 3;		/* SEL_W */
   1023 	vtx.data_format = FMT_32_32_32_32_FLOAT;
   1024 	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
   1025 	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
   1026 	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
   1027 	vtx.endian = r600_endian_swap(32);
   1028 
   1029 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
   1030 		return r;
   1031 
   1032 	return 0;
   1033 }
   1034 
   1035 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
   1036 {
   1037 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1038 	struct r600_bytecode_alu alu;
   1039 	int i, j, k, nconst, r;
   1040 
   1041 	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1042 		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
   1043 			nconst++;
   1044 		}
   1045 		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
   1046 	}
   1047 	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
   1048 		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
   1049 			continue;
   1050 		}
   1051 
   1052 		if (ctx->src[i].rel) {
   1053 			int treg = r600_get_temp(ctx);
   1054 			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].sel - 512, treg)))
   1055 				return r;
   1056 
   1057 			ctx->src[i].sel = treg;
   1058 			ctx->src[i].rel = 0;
   1059 			j--;
   1060 		} else if (j > 0) {
   1061 			int treg = r600_get_temp(ctx);
   1062 			for (k = 0; k < 4; k++) {
   1063 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1064 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   1065 				alu.src[0].sel = ctx->src[i].sel;
   1066 				alu.src[0].chan = k;
   1067 				alu.src[0].rel = ctx->src[i].rel;
   1068 				alu.dst.sel = treg;
   1069 				alu.dst.chan = k;
   1070 				alu.dst.write = 1;
   1071 				if (k == 3)
   1072 					alu.last = 1;
   1073 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   1074 				if (r)
   1075 					return r;
   1076 			}
   1077 			ctx->src[i].sel = treg;
   1078 			ctx->src[i].rel =0;
   1079 			j--;
   1080 		}
   1081 	}
   1082 	return 0;
   1083 }
   1084 
   1085 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
   1086 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
   1087 {
   1088 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1089 	struct r600_bytecode_alu alu;
   1090 	int i, j, k, nliteral, r;
   1091 
   1092 	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1093 		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
   1094 			nliteral++;
   1095 		}
   1096 	}
   1097 	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
   1098 		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
   1099 			int treg = r600_get_temp(ctx);
   1100 			for (k = 0; k < 4; k++) {
   1101 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1102 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   1103 				alu.src[0].sel = ctx->src[i].sel;
   1104 				alu.src[0].chan = k;
   1105 				alu.src[0].value = ctx->src[i].value[k];
   1106 				alu.dst.sel = treg;
   1107 				alu.dst.chan = k;
   1108 				alu.dst.write = 1;
   1109 				if (k == 3)
   1110 					alu.last = 1;
   1111 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   1112 				if (r)
   1113 					return r;
   1114 			}
   1115 			ctx->src[i].sel = treg;
   1116 			j--;
   1117 		}
   1118 	}
   1119 	return 0;
   1120 }
   1121 
   1122 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
   1123 {
   1124 	int i, r, count = ctx->shader->ninput;
   1125 
   1126 	/* additional inputs will be allocated right after the existing inputs,
   1127 	 * we won't need them after the color selection, so we don't need to
   1128 	 * reserve these gprs for the rest of the shader code and to adjust
   1129 	 * output offsets etc. */
   1130 	int gpr = ctx->file_offset[TGSI_FILE_INPUT] +
   1131 			ctx->info.file_max[TGSI_FILE_INPUT] + 1;
   1132 
   1133 	if (ctx->face_gpr == -1) {
   1134 		i = ctx->shader->ninput++;
   1135 		ctx->shader->input[i].name = TGSI_SEMANTIC_FACE;
   1136 		ctx->shader->input[i].spi_sid = 0;
   1137 		ctx->shader->input[i].gpr = gpr++;
   1138 		ctx->face_gpr = ctx->shader->input[i].gpr;
   1139 	}
   1140 
   1141 	for (i = 0; i < count; i++) {
   1142 		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
   1143 			int ni = ctx->shader->ninput++;
   1144 			memcpy(&ctx->shader->input[ni],&ctx->shader->input[i], sizeof(struct r600_shader_io));
   1145 			ctx->shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
   1146 			ctx->shader->input[ni].spi_sid = r600_spi_sid(&ctx->shader->input[ni]);
   1147 			ctx->shader->input[ni].gpr = gpr++;
   1148 
   1149 			if (ctx->bc->chip_class >= EVERGREEN) {
   1150 				r = evergreen_interp_input(ctx, ni);
   1151 				if (r)
   1152 					return r;
   1153 			}
   1154 
   1155 			r = select_twoside_color(ctx, i, ni);
   1156 			if (r)
   1157 				return r;
   1158 		}
   1159 	}
   1160 	return 0;
   1161 }
   1162 
   1163 static int r600_shader_from_tgsi(struct r600_context * rctx, struct r600_pipe_shader *pipeshader)
   1164 {
   1165 	struct r600_shader *shader = &pipeshader->shader;
   1166 	struct tgsi_token *tokens = pipeshader->selector->tokens;
   1167 	struct pipe_stream_output_info so = pipeshader->selector->so;
   1168 	struct tgsi_full_immediate *immediate;
   1169 	struct tgsi_full_property *property;
   1170 	struct r600_shader_ctx ctx;
   1171 	struct r600_bytecode_output output[32];
   1172 	unsigned output_done, noutput;
   1173 	unsigned opcode;
   1174 	int i, j, k, r = 0;
   1175 	int next_pixel_base = 0, next_pos_base = 60, next_param_base = 0;
   1176 	/* Declarations used by llvm code */
   1177 	bool use_llvm = false;
   1178 	unsigned char * inst_bytes = NULL;
   1179 	unsigned inst_byte_count = 0;
   1180 
   1181 #ifdef R600_USE_LLVM
   1182 	use_llvm = debug_get_bool_option("R600_LLVM", TRUE);
   1183 #endif
   1184 	ctx.bc = &shader->bc;
   1185 	ctx.shader = shader;
   1186 	ctx.native_integers = true;
   1187 
   1188 	r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family);
   1189 	ctx.tokens = tokens;
   1190 	tgsi_scan_shader(tokens, &ctx.info);
   1191 	tgsi_parse_init(&ctx.parse, tokens);
   1192 	ctx.type = ctx.parse.FullHeader.Processor.Processor;
   1193 	shader->processor_type = ctx.type;
   1194 	ctx.bc->type = shader->processor_type;
   1195 
   1196 	ctx.face_gpr = -1;
   1197 	ctx.fragcoord_input = -1;
   1198 	ctx.colors_used = 0;
   1199 	ctx.clip_vertex_write = 0;
   1200 
   1201 	shader->nr_ps_color_exports = 0;
   1202 	shader->nr_ps_max_color_exports = 0;
   1203 
   1204 	shader->two_side = (ctx.type == TGSI_PROCESSOR_FRAGMENT) && rctx->two_side;
   1205 
   1206 	/* register allocations */
   1207 	/* Values [0,127] correspond to GPR[0..127].
   1208 	 * Values [128,159] correspond to constant buffer bank 0
   1209 	 * Values [160,191] correspond to constant buffer bank 1
   1210 	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
   1211 	 * Values [256,287] correspond to constant buffer bank 2 (EG)
   1212 	 * Values [288,319] correspond to constant buffer bank 3 (EG)
   1213 	 * Other special values are shown in the list below.
   1214 	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
   1215 	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
   1216 	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
   1217 	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
   1218 	 * 248	SQ_ALU_SRC_0: special constant 0.0.
   1219 	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
   1220 	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
   1221 	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
   1222 	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
   1223 	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
   1224 	 * 254	SQ_ALU_SRC_PV: previous vector result.
   1225 	 * 255	SQ_ALU_SRC_PS: previous scalar result.
   1226 	 */
   1227 	for (i = 0; i < TGSI_FILE_COUNT; i++) {
   1228 		ctx.file_offset[i] = 0;
   1229 	}
   1230 	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
   1231 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
   1232 		if (ctx.bc->chip_class >= EVERGREEN) {
   1233 			r600_bytecode_add_cfinst(ctx.bc, EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
   1234 		} else {
   1235 			r600_bytecode_add_cfinst(ctx.bc, V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS);
   1236 		}
   1237 	}
   1238 	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
   1239 		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
   1240 	}
   1241 
   1242 	/* LLVM backend setup */
   1243 #ifdef R600_USE_LLVM
   1244 	if (use_llvm && ctx.info.indirect_files) {
   1245 		fprintf(stderr, "Warning: R600 LLVM backend does not support "
   1246 				"indirect adressing.  Falling back to TGSI "
   1247 				"backend.\n");
   1248 		use_llvm = 0;
   1249 	}
   1250 	if (use_llvm) {
   1251 		struct radeon_llvm_context radeon_llvm_ctx;
   1252 		LLVMModuleRef mod;
   1253 		unsigned dump = 0;
   1254 		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
   1255 		radeon_llvm_ctx.reserved_reg_count = ctx.file_offset[TGSI_FILE_INPUT];
   1256 		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
   1257 		if (debug_get_bool_option("R600_DUMP_SHADERS", FALSE)) {
   1258 			dump = 1;
   1259 		}
   1260 		if (r600_llvm_compile(mod, &inst_bytes, &inst_byte_count,
   1261 							rctx->family, dump)) {
   1262 			FREE(inst_bytes);
   1263 			radeon_llvm_dispose(&radeon_llvm_ctx);
   1264 			use_llvm = 0;
   1265 			fprintf(stderr, "R600 LLVM backend failed to compile "
   1266 				"shader.  Falling back to TGSI\n");
   1267 		} else {
   1268 			ctx.file_offset[TGSI_FILE_OUTPUT] =
   1269 					ctx.file_offset[TGSI_FILE_INPUT];
   1270 		}
   1271 		radeon_llvm_dispose(&radeon_llvm_ctx);
   1272 	}
   1273 #endif
   1274 	/* End of LLVM backend setup */
   1275 
   1276 	if (!use_llvm) {
   1277 		ctx.file_offset[TGSI_FILE_OUTPUT] =
   1278 			ctx.file_offset[TGSI_FILE_INPUT] +
   1279 			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
   1280 	}
   1281 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
   1282 						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
   1283 
   1284 	/* Outside the GPR range. This will be translated to one of the
   1285 	 * kcache banks later. */
   1286 	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
   1287 
   1288 	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
   1289 	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
   1290 			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
   1291 	ctx.temp_reg = ctx.bc->ar_reg + 1;
   1292 
   1293 	ctx.nliterals = 0;
   1294 	ctx.literals = NULL;
   1295 	shader->fs_write_all = FALSE;
   1296 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
   1297 		tgsi_parse_token(&ctx.parse);
   1298 		switch (ctx.parse.FullToken.Token.Type) {
   1299 		case TGSI_TOKEN_TYPE_IMMEDIATE:
   1300 			immediate = &ctx.parse.FullToken.FullImmediate;
   1301 			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
   1302 			if(ctx.literals == NULL) {
   1303 				r = -ENOMEM;
   1304 				goto out_err;
   1305 			}
   1306 			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
   1307 			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
   1308 			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
   1309 			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
   1310 			ctx.nliterals++;
   1311 			break;
   1312 		case TGSI_TOKEN_TYPE_DECLARATION:
   1313 			r = tgsi_declaration(&ctx);
   1314 			if (r)
   1315 				goto out_err;
   1316 			break;
   1317 		case TGSI_TOKEN_TYPE_INSTRUCTION:
   1318 			break;
   1319 		case TGSI_TOKEN_TYPE_PROPERTY:
   1320 			property = &ctx.parse.FullToken.FullProperty;
   1321 			switch (property->Property.PropertyName) {
   1322 			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
   1323 				if (property->u[0].Data == 1)
   1324 					shader->fs_write_all = TRUE;
   1325 				break;
   1326 			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
   1327 				if (property->u[0].Data == 1)
   1328 					shader->vs_prohibit_ucps = TRUE;
   1329 				break;
   1330 			}
   1331 			break;
   1332 		default:
   1333 			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
   1334 			r = -EINVAL;
   1335 			goto out_err;
   1336 		}
   1337 	}
   1338 
   1339 	if (shader->fs_write_all && rctx->chip_class >= EVERGREEN)
   1340 		shader->nr_ps_max_color_exports = 8;
   1341 
   1342 	if (ctx.fragcoord_input >= 0) {
   1343 		if (ctx.bc->chip_class == CAYMAN) {
   1344 			for (j = 0 ; j < 4; j++) {
   1345 				struct r600_bytecode_alu alu;
   1346 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1347 				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   1348 				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
   1349 				alu.src[0].chan = 3;
   1350 
   1351 				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
   1352 				alu.dst.chan = j;
   1353 				alu.dst.write = (j == 3);
   1354 				alu.last = 1;
   1355 				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
   1356 					return r;
   1357 			}
   1358 		} else {
   1359 			struct r600_bytecode_alu alu;
   1360 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1361 			alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   1362 			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
   1363 			alu.src[0].chan = 3;
   1364 
   1365 			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
   1366 			alu.dst.chan = 3;
   1367 			alu.dst.write = 1;
   1368 			alu.last = 1;
   1369 			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
   1370 				return r;
   1371 		}
   1372 	}
   1373 
   1374 	if (shader->two_side && ctx.colors_used) {
   1375 		if ((r = process_twoside_color_inputs(&ctx)))
   1376 			return r;
   1377 	}
   1378 
   1379 	tgsi_parse_init(&ctx.parse, tokens);
   1380 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
   1381 		tgsi_parse_token(&ctx.parse);
   1382 		switch (ctx.parse.FullToken.Token.Type) {
   1383 		case TGSI_TOKEN_TYPE_INSTRUCTION:
   1384 			if (use_llvm) {
   1385 				continue;
   1386 			}
   1387 			r = tgsi_is_supported(&ctx);
   1388 			if (r)
   1389 				goto out_err;
   1390 			ctx.max_driver_temp_used = 0;
   1391 			/* reserve first tmp for everyone */
   1392 			r600_get_temp(&ctx);
   1393 
   1394 			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
   1395 			if ((r = tgsi_split_constant(&ctx)))
   1396 				goto out_err;
   1397 			if ((r = tgsi_split_literal_constant(&ctx)))
   1398 				goto out_err;
   1399 			if (ctx.bc->chip_class == CAYMAN)
   1400 				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
   1401 			else if (ctx.bc->chip_class >= EVERGREEN)
   1402 				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
   1403 			else
   1404 				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
   1405 			r = ctx.inst_info->process(&ctx);
   1406 			if (r)
   1407 				goto out_err;
   1408 			break;
   1409 		default:
   1410 			break;
   1411 		}
   1412 	}
   1413 
   1414 	/* Get instructions if we are using the LLVM backend. */
   1415 	if (use_llvm) {
   1416 		r600_bytecode_from_byte_stream(&ctx, inst_bytes, inst_byte_count);
   1417 		FREE(inst_bytes);
   1418 	}
   1419 
   1420 	noutput = shader->noutput;
   1421 
   1422 	if (ctx.clip_vertex_write) {
   1423 		/* need to convert a clipvertex write into clipdistance writes and not export
   1424 		   the clip vertex anymore */
   1425 
   1426 		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
   1427 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
   1428 		shader->output[noutput].gpr = ctx.temp_reg;
   1429 		noutput++;
   1430 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
   1431 		shader->output[noutput].gpr = ctx.temp_reg+1;
   1432 		noutput++;
   1433 
   1434 		/* reset spi_sid for clipvertex output to avoid confusing spi */
   1435 		shader->output[ctx.cv_output].spi_sid = 0;
   1436 
   1437 		shader->clip_dist_write = 0xFF;
   1438 
   1439 		for (i = 0; i < 8; i++) {
   1440 			int oreg = i >> 2;
   1441 			int ochan = i & 3;
   1442 
   1443 			for (j = 0; j < 4; j++) {
   1444 				struct r600_bytecode_alu alu;
   1445 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1446 				alu.inst = BC_INST(ctx.bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4);
   1447 				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
   1448 				alu.src[0].chan = j;
   1449 
   1450 				alu.src[1].sel = 512 + i;
   1451 				alu.src[1].kc_bank = 1;
   1452 				alu.src[1].chan = j;
   1453 
   1454 				alu.dst.sel = ctx.temp_reg + oreg;
   1455 				alu.dst.chan = j;
   1456 				alu.dst.write = (j == ochan);
   1457 				if (j == 3)
   1458 					alu.last = 1;
   1459 				r = r600_bytecode_add_alu(ctx.bc, &alu);
   1460 				if (r)
   1461 					return r;
   1462 			}
   1463 		}
   1464 	}
   1465 
   1466 	/* Add stream outputs. */
   1467 	if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
   1468 		for (i = 0; i < so.num_outputs; i++) {
   1469 			struct r600_bytecode_output output;
   1470 
   1471 			if (so.output[i].output_buffer >= 4) {
   1472 				R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
   1473 					 so.output[i].output_buffer);
   1474 				r = -EINVAL;
   1475 				goto out_err;
   1476 			}
   1477 			if (so.output[i].dst_offset < so.output[i].start_component) {
   1478 			   R600_ERR("stream_output - dst_offset cannot be less than start_component\n");
   1479 			   r = -EINVAL;
   1480 			   goto out_err;
   1481 			}
   1482 
   1483 			memset(&output, 0, sizeof(struct r600_bytecode_output));
   1484 			output.gpr = shader->output[so.output[i].register_index].gpr;
   1485 			output.elem_size = 0;
   1486 			output.array_base = so.output[i].dst_offset - so.output[i].start_component;
   1487 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
   1488 			output.burst_count = 1;
   1489 			output.barrier = 1;
   1490 			/* array_size is an upper limit for the burst_count
   1491 			 * with MEM_STREAM instructions */
   1492 			output.array_size = 0xFFF;
   1493 			output.comp_mask = ((1 << so.output[i].num_components) - 1) << so.output[i].start_component;
   1494 			if (ctx.bc->chip_class >= EVERGREEN) {
   1495 				switch (so.output[i].output_buffer) {
   1496 				case 0:
   1497 					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
   1498 					break;
   1499 				case 1:
   1500 					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
   1501 					break;
   1502 				case 2:
   1503 					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
   1504 					break;
   1505 				case 3:
   1506 					output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
   1507 					break;
   1508 				}
   1509 			} else {
   1510 				switch (so.output[i].output_buffer) {
   1511 				case 0:
   1512 					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
   1513 					break;
   1514 				case 1:
   1515 					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
   1516 					break;
   1517 				case 2:
   1518 					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
   1519 					break;
   1520 				case 3:
   1521 					output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
   1522 					break;
   1523 				}
   1524 			}
   1525 			r = r600_bytecode_add_output(ctx.bc, &output);
   1526 			if (r)
   1527 				goto out_err;
   1528 		}
   1529 	}
   1530 
   1531 	/* export output */
   1532 	for (i = 0, j = 0; i < noutput; i++, j++) {
   1533 		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   1534 		output[j].gpr = shader->output[i].gpr;
   1535 		output[j].elem_size = 3;
   1536 		output[j].swizzle_x = 0;
   1537 		output[j].swizzle_y = 1;
   1538 		output[j].swizzle_z = 2;
   1539 		output[j].swizzle_w = 3;
   1540 		output[j].burst_count = 1;
   1541 		output[j].barrier = 1;
   1542 		output[j].type = -1;
   1543 		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
   1544 		switch (ctx.type) {
   1545 		case TGSI_PROCESSOR_VERTEX:
   1546 			switch (shader->output[i].name) {
   1547 			case TGSI_SEMANTIC_POSITION:
   1548 				output[j].array_base = next_pos_base++;
   1549 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   1550 				break;
   1551 
   1552 			case TGSI_SEMANTIC_PSIZE:
   1553 				output[j].array_base = next_pos_base++;
   1554 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   1555 				break;
   1556 			case TGSI_SEMANTIC_CLIPVERTEX:
   1557 				j--;
   1558 				break;
   1559 			case TGSI_SEMANTIC_CLIPDIST:
   1560 				output[j].array_base = next_pos_base++;
   1561 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   1562 				/* spi_sid is 0 for clipdistance outputs that were generated
   1563 				 * for clipvertex - we don't need to pass them to PS */
   1564 				if (shader->output[i].spi_sid) {
   1565 					j++;
   1566 					/* duplicate it as PARAM to pass to the pixel shader */
   1567 					memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
   1568 					output[j].array_base = next_param_base++;
   1569 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   1570 				}
   1571 				break;
   1572 			case TGSI_SEMANTIC_FOG:
   1573 				output[j].swizzle_y = 4; /* 0 */
   1574 				output[j].swizzle_z = 4; /* 0 */
   1575 				output[j].swizzle_w = 5; /* 1 */
   1576 				break;
   1577 			}
   1578 			break;
   1579 		case TGSI_PROCESSOR_FRAGMENT:
   1580 			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
   1581 				/* never export more colors than the number of CBs */
   1582 				if (next_pixel_base && next_pixel_base >= (rctx->nr_cbufs + rctx->dual_src_blend * 1)) {
   1583 					/* skip export */
   1584 					j--;
   1585 					continue;
   1586 				}
   1587 				output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
   1588 				output[j].array_base = next_pixel_base++;
   1589 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   1590 				shader->nr_ps_color_exports++;
   1591 				if (shader->fs_write_all && (rctx->chip_class >= EVERGREEN)) {
   1592 					for (k = 1; k < rctx->nr_cbufs; k++) {
   1593 						j++;
   1594 						memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   1595 						output[j].gpr = shader->output[i].gpr;
   1596 						output[j].elem_size = 3;
   1597 						output[j].swizzle_x = 0;
   1598 						output[j].swizzle_y = 1;
   1599 						output[j].swizzle_z = 2;
   1600 						output[j].swizzle_w = rctx->alpha_to_one && rctx->multisample_enable && !rctx->cb0_is_integer ? 5 : 3;
   1601 						output[j].burst_count = 1;
   1602 						output[j].barrier = 1;
   1603 						output[j].array_base = next_pixel_base++;
   1604 						output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
   1605 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   1606 						shader->nr_ps_color_exports++;
   1607 					}
   1608 				}
   1609 			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
   1610 				output[j].array_base = 61;
   1611 				output[j].swizzle_x = 2;
   1612 				output[j].swizzle_y = 7;
   1613 				output[j].swizzle_z = output[j].swizzle_w = 7;
   1614 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   1615 			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
   1616 				output[j].array_base = 61;
   1617 				output[j].swizzle_x = 7;
   1618 				output[j].swizzle_y = 1;
   1619 				output[j].swizzle_z = output[j].swizzle_w = 7;
   1620 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   1621 			} else {
   1622 				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
   1623 				r = -EINVAL;
   1624 				goto out_err;
   1625 			}
   1626 			break;
   1627 		default:
   1628 			R600_ERR("unsupported processor type %d\n", ctx.type);
   1629 			r = -EINVAL;
   1630 			goto out_err;
   1631 		}
   1632 
   1633 		if (output[j].type==-1) {
   1634 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   1635 			output[j].array_base = next_param_base++;
   1636 		}
   1637 	}
   1638 
   1639 	/* add fake param output for vertex shader if no param is exported */
   1640 	if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
   1641 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   1642 			output[j].gpr = 0;
   1643 			output[j].elem_size = 3;
   1644 			output[j].swizzle_x = 7;
   1645 			output[j].swizzle_y = 7;
   1646 			output[j].swizzle_z = 7;
   1647 			output[j].swizzle_w = 7;
   1648 			output[j].burst_count = 1;
   1649 			output[j].barrier = 1;
   1650 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   1651 			output[j].array_base = 0;
   1652 			output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
   1653 			j++;
   1654 	}
   1655 
   1656 	/* add fake pixel export */
   1657 	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && next_pixel_base == 0) {
   1658 		memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   1659 		output[j].gpr = 0;
   1660 		output[j].elem_size = 3;
   1661 		output[j].swizzle_x = 7;
   1662 		output[j].swizzle_y = 7;
   1663 		output[j].swizzle_z = 7;
   1664 		output[j].swizzle_w = 7;
   1665 		output[j].burst_count = 1;
   1666 		output[j].barrier = 1;
   1667 		output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   1668 		output[j].array_base = 0;
   1669 		output[j].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT);
   1670 		j++;
   1671 	}
   1672 
   1673 	noutput = j;
   1674 
   1675 	/* set export done on last export of each type */
   1676 	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
   1677 		if (ctx.bc->chip_class < CAYMAN) {
   1678 			if (i == (noutput - 1)) {
   1679 				output[i].end_of_program = 1;
   1680 			}
   1681 		}
   1682 		if (!(output_done & (1 << output[i].type))) {
   1683 			output_done |= (1 << output[i].type);
   1684 			output[i].inst = BC_INST(ctx.bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE);
   1685 		}
   1686 	}
   1687 	/* add output to bytecode */
   1688 	for (i = 0; i < noutput; i++) {
   1689 		r = r600_bytecode_add_output(ctx.bc, &output[i]);
   1690 		if (r)
   1691 			goto out_err;
   1692 	}
   1693 	/* add program end */
   1694 	if (ctx.bc->chip_class == CAYMAN)
   1695 		cm_bytecode_add_cf_end(ctx.bc);
   1696 
   1697 	/* check GPR limit - we have 124 = 128 - 4
   1698 	 * (4 are reserved as alu clause temporary registers) */
   1699 	if (ctx.bc->ngpr > 124) {
   1700 		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
   1701 		r = -ENOMEM;
   1702 		goto out_err;
   1703 	}
   1704 
   1705 	free(ctx.literals);
   1706 	tgsi_parse_free(&ctx.parse);
   1707 	return 0;
   1708 out_err:
   1709 	free(ctx.literals);
   1710 	tgsi_parse_free(&ctx.parse);
   1711 	return r;
   1712 }
   1713 
   1714 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
   1715 {
   1716 	R600_ERR("%s tgsi opcode unsupported\n",
   1717 		 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
   1718 	return -EINVAL;
   1719 }
   1720 
   1721 static int tgsi_end(struct r600_shader_ctx *ctx)
   1722 {
   1723 	return 0;
   1724 }
   1725 
   1726 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
   1727 			const struct r600_shader_src *shader_src,
   1728 			unsigned chan)
   1729 {
   1730 	bc_src->sel = shader_src->sel;
   1731 	bc_src->chan = shader_src->swizzle[chan];
   1732 	bc_src->neg = shader_src->neg;
   1733 	bc_src->abs = shader_src->abs;
   1734 	bc_src->rel = shader_src->rel;
   1735 	bc_src->value = shader_src->value[bc_src->chan];
   1736 }
   1737 
   1738 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
   1739 {
   1740 	bc_src->abs = 1;
   1741 	bc_src->neg = 0;
   1742 }
   1743 
   1744 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
   1745 {
   1746 	bc_src->neg = !bc_src->neg;
   1747 }
   1748 
   1749 static void tgsi_dst(struct r600_shader_ctx *ctx,
   1750 		     const struct tgsi_full_dst_register *tgsi_dst,
   1751 		     unsigned swizzle,
   1752 		     struct r600_bytecode_alu_dst *r600_dst)
   1753 {
   1754 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1755 
   1756 	r600_dst->sel = tgsi_dst->Register.Index;
   1757 	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
   1758 	r600_dst->chan = swizzle;
   1759 	r600_dst->write = 1;
   1760 	if (tgsi_dst->Register.Indirect)
   1761 		r600_dst->rel = V_SQ_REL_RELATIVE;
   1762 	if (inst->Instruction.Saturate) {
   1763 		r600_dst->clamp = 1;
   1764 	}
   1765 }
   1766 
   1767 static int tgsi_last_instruction(unsigned writemask)
   1768 {
   1769 	int i, lasti = 0;
   1770 
   1771 	for (i = 0; i < 4; i++) {
   1772 		if (writemask & (1 << i)) {
   1773 			lasti = i;
   1774 		}
   1775 	}
   1776 	return lasti;
   1777 }
   1778 
   1779 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
   1780 {
   1781 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1782 	struct r600_bytecode_alu alu;
   1783 	int i, j, r;
   1784 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   1785 
   1786 	for (i = 0; i < lasti + 1; i++) {
   1787 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   1788 			continue;
   1789 
   1790 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1791 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   1792 
   1793 		alu.inst = ctx->inst_info->r600_opcode;
   1794 		if (!swap) {
   1795 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   1796 				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
   1797 			}
   1798 		} else {
   1799 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   1800 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   1801 		}
   1802 		/* handle some special cases */
   1803 		switch (ctx->inst_info->tgsi_opcode) {
   1804 		case TGSI_OPCODE_SUB:
   1805 			r600_bytecode_src_toggle_neg(&alu.src[1]);
   1806 			break;
   1807 		case TGSI_OPCODE_ABS:
   1808 			r600_bytecode_src_set_abs(&alu.src[0]);
   1809 			break;
   1810 		default:
   1811 			break;
   1812 		}
   1813 		if (i == lasti || trans_only) {
   1814 			alu.last = 1;
   1815 		}
   1816 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   1817 		if (r)
   1818 			return r;
   1819 	}
   1820 	return 0;
   1821 }
   1822 
   1823 static int tgsi_op2(struct r600_shader_ctx *ctx)
   1824 {
   1825 	return tgsi_op2_s(ctx, 0, 0);
   1826 }
   1827 
   1828 static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
   1829 {
   1830 	return tgsi_op2_s(ctx, 1, 0);
   1831 }
   1832 
   1833 static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
   1834 {
   1835 	return tgsi_op2_s(ctx, 0, 1);
   1836 }
   1837 
   1838 static int tgsi_ineg(struct r600_shader_ctx *ctx)
   1839 {
   1840 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1841 	struct r600_bytecode_alu alu;
   1842 	int i, r;
   1843 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   1844 
   1845 	for (i = 0; i < lasti + 1; i++) {
   1846 
   1847 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   1848 			continue;
   1849 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1850 		alu.inst = ctx->inst_info->r600_opcode;
   1851 
   1852 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   1853 
   1854 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   1855 
   1856 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   1857 
   1858 		if (i == lasti) {
   1859 			alu.last = 1;
   1860 		}
   1861 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   1862 		if (r)
   1863 			return r;
   1864 	}
   1865 	return 0;
   1866 
   1867 }
   1868 
   1869 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
   1870 {
   1871 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1872 	int i, j, r;
   1873 	struct r600_bytecode_alu alu;
   1874 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
   1875 
   1876 	for (i = 0 ; i < last_slot; i++) {
   1877 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1878 		alu.inst = ctx->inst_info->r600_opcode;
   1879 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   1880 			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
   1881 
   1882 			/* RSQ should take the absolute value of src */
   1883 			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
   1884 				r600_bytecode_src_set_abs(&alu.src[j]);
   1885 			}
   1886 		}
   1887 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   1888 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   1889 
   1890 		if (i == last_slot - 1)
   1891 			alu.last = 1;
   1892 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   1893 		if (r)
   1894 			return r;
   1895 	}
   1896 	return 0;
   1897 }
   1898 
   1899 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
   1900 {
   1901 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1902 	int i, j, k, r;
   1903 	struct r600_bytecode_alu alu;
   1904 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
   1905 	for (k = 0; k < last_slot; k++) {
   1906 		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
   1907 			continue;
   1908 
   1909 		for (i = 0 ; i < 4; i++) {
   1910 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1911 			alu.inst = ctx->inst_info->r600_opcode;
   1912 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   1913 				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
   1914 			}
   1915 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   1916 			alu.dst.write = (i == k);
   1917 			if (i == 3)
   1918 				alu.last = 1;
   1919 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   1920 			if (r)
   1921 				return r;
   1922 		}
   1923 	}
   1924 	return 0;
   1925 }
   1926 
   1927 /*
   1928  * r600 - trunc to -PI..PI range
   1929  * r700 - normalize by dividing by 2PI
   1930  * see fdo bug 27901
   1931  */
   1932 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
   1933 {
   1934 	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
   1935 	static float double_pi = 3.1415926535 * 2;
   1936 	static float neg_pi = -3.1415926535;
   1937 
   1938 	int r;
   1939 	struct r600_bytecode_alu alu;
   1940 
   1941 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1942 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
   1943 	alu.is_op3 = 1;
   1944 
   1945 	alu.dst.chan = 0;
   1946 	alu.dst.sel = ctx->temp_reg;
   1947 	alu.dst.write = 1;
   1948 
   1949 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   1950 
   1951 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   1952 	alu.src[1].chan = 0;
   1953 	alu.src[1].value = *(uint32_t *)&half_inv_pi;
   1954 	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
   1955 	alu.src[2].chan = 0;
   1956 	alu.last = 1;
   1957 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   1958 	if (r)
   1959 		return r;
   1960 
   1961 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1962 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
   1963 
   1964 	alu.dst.chan = 0;
   1965 	alu.dst.sel = ctx->temp_reg;
   1966 	alu.dst.write = 1;
   1967 
   1968 	alu.src[0].sel = ctx->temp_reg;
   1969 	alu.src[0].chan = 0;
   1970 	alu.last = 1;
   1971 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   1972 	if (r)
   1973 		return r;
   1974 
   1975 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1976 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
   1977 	alu.is_op3 = 1;
   1978 
   1979 	alu.dst.chan = 0;
   1980 	alu.dst.sel = ctx->temp_reg;
   1981 	alu.dst.write = 1;
   1982 
   1983 	alu.src[0].sel = ctx->temp_reg;
   1984 	alu.src[0].chan = 0;
   1985 
   1986 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   1987 	alu.src[1].chan = 0;
   1988 	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
   1989 	alu.src[2].chan = 0;
   1990 
   1991 	if (ctx->bc->chip_class == R600) {
   1992 		alu.src[1].value = *(uint32_t *)&double_pi;
   1993 		alu.src[2].value = *(uint32_t *)&neg_pi;
   1994 	} else {
   1995 		alu.src[1].sel = V_SQ_ALU_SRC_1;
   1996 		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
   1997 		alu.src[2].neg = 1;
   1998 	}
   1999 
   2000 	alu.last = 1;
   2001 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2002 	if (r)
   2003 		return r;
   2004 	return 0;
   2005 }
   2006 
   2007 static int cayman_trig(struct r600_shader_ctx *ctx)
   2008 {
   2009 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2010 	struct r600_bytecode_alu alu;
   2011 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
   2012 	int i, r;
   2013 
   2014 	r = tgsi_setup_trig(ctx);
   2015 	if (r)
   2016 		return r;
   2017 
   2018 
   2019 	for (i = 0; i < last_slot; i++) {
   2020 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2021 		alu.inst = ctx->inst_info->r600_opcode;
   2022 		alu.dst.chan = i;
   2023 
   2024 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   2025 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   2026 
   2027 		alu.src[0].sel = ctx->temp_reg;
   2028 		alu.src[0].chan = 0;
   2029 		if (i == last_slot - 1)
   2030 			alu.last = 1;
   2031 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2032 		if (r)
   2033 			return r;
   2034 	}
   2035 	return 0;
   2036 }
   2037 
   2038 static int tgsi_trig(struct r600_shader_ctx *ctx)
   2039 {
   2040 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2041 	struct r600_bytecode_alu alu;
   2042 	int i, r;
   2043 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   2044 
   2045 	r = tgsi_setup_trig(ctx);
   2046 	if (r)
   2047 		return r;
   2048 
   2049 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2050 	alu.inst = ctx->inst_info->r600_opcode;
   2051 	alu.dst.chan = 0;
   2052 	alu.dst.sel = ctx->temp_reg;
   2053 	alu.dst.write = 1;
   2054 
   2055 	alu.src[0].sel = ctx->temp_reg;
   2056 	alu.src[0].chan = 0;
   2057 	alu.last = 1;
   2058 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2059 	if (r)
   2060 		return r;
   2061 
   2062 	/* replicate result */
   2063 	for (i = 0; i < lasti + 1; i++) {
   2064 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   2065 			continue;
   2066 
   2067 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2068 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   2069 
   2070 		alu.src[0].sel = ctx->temp_reg;
   2071 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   2072 		if (i == lasti)
   2073 			alu.last = 1;
   2074 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2075 		if (r)
   2076 			return r;
   2077 	}
   2078 	return 0;
   2079 }
   2080 
   2081 static int tgsi_scs(struct r600_shader_ctx *ctx)
   2082 {
   2083 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2084 	struct r600_bytecode_alu alu;
   2085 	int i, r;
   2086 
   2087 	/* We'll only need the trig stuff if we are going to write to the
   2088 	 * X or Y components of the destination vector.
   2089 	 */
   2090 	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
   2091 		r = tgsi_setup_trig(ctx);
   2092 		if (r)
   2093 			return r;
   2094 	}
   2095 
   2096 	/* dst.x = COS */
   2097 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   2098 		if (ctx->bc->chip_class == CAYMAN) {
   2099 			for (i = 0 ; i < 3; i++) {
   2100 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2101 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
   2102 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   2103 
   2104 				if (i == 0)
   2105 					alu.dst.write = 1;
   2106 				else
   2107 					alu.dst.write = 0;
   2108 				alu.src[0].sel = ctx->temp_reg;
   2109 				alu.src[0].chan = 0;
   2110 				if (i == 2)
   2111 					alu.last = 1;
   2112 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   2113 				if (r)
   2114 					return r;
   2115 			}
   2116 		} else {
   2117 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2118 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS);
   2119 			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
   2120 
   2121 			alu.src[0].sel = ctx->temp_reg;
   2122 			alu.src[0].chan = 0;
   2123 			alu.last = 1;
   2124 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   2125 			if (r)
   2126 				return r;
   2127 		}
   2128 	}
   2129 
   2130 	/* dst.y = SIN */
   2131 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   2132 		if (ctx->bc->chip_class == CAYMAN) {
   2133 			for (i = 0 ; i < 3; i++) {
   2134 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2135 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
   2136 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   2137 				if (i == 1)
   2138 					alu.dst.write = 1;
   2139 				else
   2140 					alu.dst.write = 0;
   2141 				alu.src[0].sel = ctx->temp_reg;
   2142 				alu.src[0].chan = 0;
   2143 				if (i == 2)
   2144 					alu.last = 1;
   2145 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   2146 				if (r)
   2147 					return r;
   2148 			}
   2149 		} else {
   2150 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2151 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN);
   2152 			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
   2153 
   2154 			alu.src[0].sel = ctx->temp_reg;
   2155 			alu.src[0].chan = 0;
   2156 			alu.last = 1;
   2157 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   2158 			if (r)
   2159 				return r;
   2160 		}
   2161 	}
   2162 
   2163 	/* dst.z = 0.0; */
   2164 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   2165 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2166 
   2167 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   2168 
   2169 		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
   2170 
   2171 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   2172 		alu.src[0].chan = 0;
   2173 
   2174 		alu.last = 1;
   2175 
   2176 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2177 		if (r)
   2178 			return r;
   2179 	}
   2180 
   2181 	/* dst.w = 1.0; */
   2182 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   2183 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2184 
   2185 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   2186 
   2187 		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
   2188 
   2189 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   2190 		alu.src[0].chan = 0;
   2191 
   2192 		alu.last = 1;
   2193 
   2194 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2195 		if (r)
   2196 			return r;
   2197 	}
   2198 
   2199 	return 0;
   2200 }
   2201 
   2202 static int tgsi_kill(struct r600_shader_ctx *ctx)
   2203 {
   2204 	struct r600_bytecode_alu alu;
   2205 	int i, r;
   2206 
   2207 	for (i = 0; i < 4; i++) {
   2208 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2209 		alu.inst = ctx->inst_info->r600_opcode;
   2210 
   2211 		alu.dst.chan = i;
   2212 
   2213 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   2214 
   2215 		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) {
   2216 			alu.src[1].sel = V_SQ_ALU_SRC_1;
   2217 			alu.src[1].neg = 1;
   2218 		} else {
   2219 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   2220 		}
   2221 		if (i == 3) {
   2222 			alu.last = 1;
   2223 		}
   2224 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2225 		if (r)
   2226 			return r;
   2227 	}
   2228 
   2229 	/* kill must be last in ALU */
   2230 	ctx->bc->force_add_cf = 1;
   2231 	ctx->shader->uses_kill = TRUE;
   2232 	return 0;
   2233 }
   2234 
   2235 static int tgsi_lit(struct r600_shader_ctx *ctx)
   2236 {
   2237 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2238 	struct r600_bytecode_alu alu;
   2239 	int r;
   2240 
   2241 	/* tmp.x = max(src.y, 0.0) */
   2242 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2243 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
   2244 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
   2245 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
   2246 	alu.src[1].chan = 1;
   2247 
   2248 	alu.dst.sel = ctx->temp_reg;
   2249 	alu.dst.chan = 0;
   2250 	alu.dst.write = 1;
   2251 
   2252 	alu.last = 1;
   2253 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2254 	if (r)
   2255 		return r;
   2256 
   2257 	if (inst->Dst[0].Register.WriteMask & (1 << 2))
   2258 	{
   2259 		int chan;
   2260 		int sel;
   2261 		int i;
   2262 
   2263 		if (ctx->bc->chip_class == CAYMAN) {
   2264 			for (i = 0; i < 3; i++) {
   2265 				/* tmp.z = log(tmp.x) */
   2266 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2267 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
   2268 				alu.src[0].sel = ctx->temp_reg;
   2269 				alu.src[0].chan = 0;
   2270 				alu.dst.sel = ctx->temp_reg;
   2271 				alu.dst.chan = i;
   2272 				if (i == 2) {
   2273 					alu.dst.write = 1;
   2274 					alu.last = 1;
   2275 				} else
   2276 					alu.dst.write = 0;
   2277 
   2278 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   2279 				if (r)
   2280 					return r;
   2281 			}
   2282 		} else {
   2283 			/* tmp.z = log(tmp.x) */
   2284 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2285 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED);
   2286 			alu.src[0].sel = ctx->temp_reg;
   2287 			alu.src[0].chan = 0;
   2288 			alu.dst.sel = ctx->temp_reg;
   2289 			alu.dst.chan = 2;
   2290 			alu.dst.write = 1;
   2291 			alu.last = 1;
   2292 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   2293 			if (r)
   2294 				return r;
   2295 		}
   2296 
   2297 		chan = alu.dst.chan;
   2298 		sel = alu.dst.sel;
   2299 
   2300 		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
   2301 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2302 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT);
   2303 		alu.src[0].sel  = sel;
   2304 		alu.src[0].chan = chan;
   2305 		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
   2306 		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
   2307 		alu.dst.sel = ctx->temp_reg;
   2308 		alu.dst.chan = 0;
   2309 		alu.dst.write = 1;
   2310 		alu.is_op3 = 1;
   2311 		alu.last = 1;
   2312 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2313 		if (r)
   2314 			return r;
   2315 
   2316 		if (ctx->bc->chip_class == CAYMAN) {
   2317 			for (i = 0; i < 3; i++) {
   2318 				/* dst.z = exp(tmp.x) */
   2319 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2320 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   2321 				alu.src[0].sel = ctx->temp_reg;
   2322 				alu.src[0].chan = 0;
   2323 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   2324 				if (i == 2) {
   2325 					alu.dst.write = 1;
   2326 					alu.last = 1;
   2327 				} else
   2328 					alu.dst.write = 0;
   2329 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   2330 				if (r)
   2331 					return r;
   2332 			}
   2333 		} else {
   2334 			/* dst.z = exp(tmp.x) */
   2335 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2336 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   2337 			alu.src[0].sel = ctx->temp_reg;
   2338 			alu.src[0].chan = 0;
   2339 			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
   2340 			alu.last = 1;
   2341 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   2342 			if (r)
   2343 				return r;
   2344 		}
   2345 	}
   2346 
   2347 	/* dst.x, <- 1.0  */
   2348 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2349 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   2350 	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
   2351 	alu.src[0].chan = 0;
   2352 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
   2353 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
   2354 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2355 	if (r)
   2356 		return r;
   2357 
   2358 	/* dst.y = max(src.x, 0.0) */
   2359 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2360 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX);
   2361 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   2362 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
   2363 	alu.src[1].chan = 0;
   2364 	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
   2365 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
   2366 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2367 	if (r)
   2368 		return r;
   2369 
   2370 	/* dst.w, <- 1.0  */
   2371 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2372 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   2373 	alu.src[0].sel  = V_SQ_ALU_SRC_1;
   2374 	alu.src[0].chan = 0;
   2375 	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
   2376 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
   2377 	alu.last = 1;
   2378 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2379 	if (r)
   2380 		return r;
   2381 
   2382 	return 0;
   2383 }
   2384 
   2385 static int tgsi_rsq(struct r600_shader_ctx *ctx)
   2386 {
   2387 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2388 	struct r600_bytecode_alu alu;
   2389 	int i, r;
   2390 
   2391 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2392 
   2393 	/* XXX:
   2394 	 * For state trackers other than OpenGL, we'll want to use
   2395 	 * _RECIPSQRT_IEEE instead.
   2396 	 */
   2397 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED);
   2398 
   2399 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   2400 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
   2401 		r600_bytecode_src_set_abs(&alu.src[i]);
   2402 	}
   2403 	alu.dst.sel = ctx->temp_reg;
   2404 	alu.dst.write = 1;
   2405 	alu.last = 1;
   2406 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2407 	if (r)
   2408 		return r;
   2409 	/* replicate result */
   2410 	return tgsi_helper_tempx_replicate(ctx);
   2411 }
   2412 
   2413 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
   2414 {
   2415 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2416 	struct r600_bytecode_alu alu;
   2417 	int i, r;
   2418 
   2419 	for (i = 0; i < 4; i++) {
   2420 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2421 		alu.src[0].sel = ctx->temp_reg;
   2422 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   2423 		alu.dst.chan = i;
   2424 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   2425 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   2426 		if (i == 3)
   2427 			alu.last = 1;
   2428 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2429 		if (r)
   2430 			return r;
   2431 	}
   2432 	return 0;
   2433 }
   2434 
   2435 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
   2436 {
   2437 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2438 	struct r600_bytecode_alu alu;
   2439 	int i, r;
   2440 
   2441 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2442 	alu.inst = ctx->inst_info->r600_opcode;
   2443 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   2444 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
   2445 	}
   2446 	alu.dst.sel = ctx->temp_reg;
   2447 	alu.dst.write = 1;
   2448 	alu.last = 1;
   2449 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2450 	if (r)
   2451 		return r;
   2452 	/* replicate result */
   2453 	return tgsi_helper_tempx_replicate(ctx);
   2454 }
   2455 
   2456 static int cayman_pow(struct r600_shader_ctx *ctx)
   2457 {
   2458 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2459 	int i, r;
   2460 	struct r600_bytecode_alu alu;
   2461 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
   2462 
   2463 	for (i = 0; i < 3; i++) {
   2464 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2465 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   2466 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   2467 		alu.dst.sel = ctx->temp_reg;
   2468 		alu.dst.chan = i;
   2469 		alu.dst.write = 1;
   2470 		if (i == 2)
   2471 			alu.last = 1;
   2472 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2473 		if (r)
   2474 			return r;
   2475 	}
   2476 
   2477 	/* b * LOG2(a) */
   2478 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2479 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   2480 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
   2481 	alu.src[1].sel = ctx->temp_reg;
   2482 	alu.dst.sel = ctx->temp_reg;
   2483 	alu.dst.write = 1;
   2484 	alu.last = 1;
   2485 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2486 	if (r)
   2487 		return r;
   2488 
   2489 	for (i = 0; i < last_slot; i++) {
   2490 		/* POW(a,b) = EXP2(b * LOG2(a))*/
   2491 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2492 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   2493 		alu.src[0].sel = ctx->temp_reg;
   2494 
   2495 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   2496 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   2497 		if (i == last_slot - 1)
   2498 			alu.last = 1;
   2499 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2500 		if (r)
   2501 			return r;
   2502 	}
   2503 	return 0;
   2504 }
   2505 
   2506 static int tgsi_pow(struct r600_shader_ctx *ctx)
   2507 {
   2508 	struct r600_bytecode_alu alu;
   2509 	int r;
   2510 
   2511 	/* LOG2(a) */
   2512 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2513 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   2514 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   2515 	alu.dst.sel = ctx->temp_reg;
   2516 	alu.dst.write = 1;
   2517 	alu.last = 1;
   2518 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2519 	if (r)
   2520 		return r;
   2521 	/* b * LOG2(a) */
   2522 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2523 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   2524 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
   2525 	alu.src[1].sel = ctx->temp_reg;
   2526 	alu.dst.sel = ctx->temp_reg;
   2527 	alu.dst.write = 1;
   2528 	alu.last = 1;
   2529 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2530 	if (r)
   2531 		return r;
   2532 	/* POW(a,b) = EXP2(b * LOG2(a))*/
   2533 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2534 	alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   2535 	alu.src[0].sel = ctx->temp_reg;
   2536 	alu.dst.sel = ctx->temp_reg;
   2537 	alu.dst.write = 1;
   2538 	alu.last = 1;
   2539 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   2540 	if (r)
   2541 		return r;
   2542 	return tgsi_helper_tempx_replicate(ctx);
   2543 }
   2544 
   2545 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
   2546 {
   2547 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2548 	struct r600_bytecode_alu alu;
   2549 	int i, r, j;
   2550 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   2551 	int tmp0 = ctx->temp_reg;
   2552 	int tmp1 = r600_get_temp(ctx);
   2553 	int tmp2 = r600_get_temp(ctx);
   2554 	int tmp3 = r600_get_temp(ctx);
   2555 	/* Unsigned path:
   2556 	 *
   2557 	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
   2558 	 *
   2559 	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
   2560 	 * 2. tmp0.z = lo (tmp0.x * src2)
   2561 	 * 3. tmp0.w = -tmp0.z
   2562 	 * 4. tmp0.y = hi (tmp0.x * src2)
   2563 	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
   2564 	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
   2565 	 * 7. tmp1.x = tmp0.x - tmp0.w
   2566 	 * 8. tmp1.y = tmp0.x + tmp0.w
   2567 	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
   2568 	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
   2569 	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
   2570 	 *
   2571 	 * 12. tmp0.w = src1 - tmp0.y       = r
   2572 	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
   2573 	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
   2574 	 *
   2575 	 * if DIV
   2576 	 *
   2577 	 *   15. tmp1.z = tmp0.z + 1			= q + 1
   2578 	 *   16. tmp1.w = tmp0.z - 1			= q - 1
   2579 	 *
   2580 	 * else MOD
   2581 	 *
   2582 	 *   15. tmp1.z = tmp0.w - src2			= r - src2
   2583 	 *   16. tmp1.w = tmp0.w + src2			= r + src2
   2584 	 *
   2585 	 * endif
   2586 	 *
   2587 	 * 17. tmp1.x = tmp1.x & tmp1.y
   2588 	 *
   2589 	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
   2590 	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
   2591 	 *
   2592 	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
   2593 	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
   2594 	 *
   2595 	 * Signed path:
   2596 	 *
   2597 	 * Same as unsigned, using abs values of the operands,
   2598 	 * and fixing the sign of the result in the end.
   2599 	 */
   2600 
   2601 	for (i = 0; i < 4; i++) {
   2602 		if (!(write_mask & (1<<i)))
   2603 			continue;
   2604 
   2605 		if (signed_op) {
   2606 
   2607 			/* tmp2.x = -src0 */
   2608 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2609 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   2610 
   2611 			alu.dst.sel = tmp2;
   2612 			alu.dst.chan = 0;
   2613 			alu.dst.write = 1;
   2614 
   2615 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   2616 
   2617 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   2618 
   2619 			alu.last = 1;
   2620 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2621 				return r;
   2622 
   2623 			/* tmp2.y = -src1 */
   2624 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2625 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   2626 
   2627 			alu.dst.sel = tmp2;
   2628 			alu.dst.chan = 1;
   2629 			alu.dst.write = 1;
   2630 
   2631 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   2632 
   2633 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   2634 
   2635 			alu.last = 1;
   2636 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2637 				return r;
   2638 
   2639 			/* tmp2.z sign bit is set if src0 and src2 signs are different */
   2640 			/* it will be a sign of the quotient */
   2641 			if (!mod) {
   2642 
   2643 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2644 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT);
   2645 
   2646 				alu.dst.sel = tmp2;
   2647 				alu.dst.chan = 2;
   2648 				alu.dst.write = 1;
   2649 
   2650 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   2651 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   2652 
   2653 				alu.last = 1;
   2654 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2655 					return r;
   2656 			}
   2657 
   2658 			/* tmp2.x = |src0| */
   2659 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2660 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
   2661 			alu.is_op3 = 1;
   2662 
   2663 			alu.dst.sel = tmp2;
   2664 			alu.dst.chan = 0;
   2665 			alu.dst.write = 1;
   2666 
   2667 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   2668 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   2669 			alu.src[2].sel = tmp2;
   2670 			alu.src[2].chan = 0;
   2671 
   2672 			alu.last = 1;
   2673 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2674 				return r;
   2675 
   2676 			/* tmp2.y = |src1| */
   2677 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2678 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
   2679 			alu.is_op3 = 1;
   2680 
   2681 			alu.dst.sel = tmp2;
   2682 			alu.dst.chan = 1;
   2683 			alu.dst.write = 1;
   2684 
   2685 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   2686 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   2687 			alu.src[2].sel = tmp2;
   2688 			alu.src[2].chan = 1;
   2689 
   2690 			alu.last = 1;
   2691 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2692 				return r;
   2693 
   2694 		}
   2695 
   2696 		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
   2697 		if (ctx->bc->chip_class == CAYMAN) {
   2698 			/* tmp3.x = u2f(src2) */
   2699 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2700 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT);
   2701 
   2702 			alu.dst.sel = tmp3;
   2703 			alu.dst.chan = 0;
   2704 			alu.dst.write = 1;
   2705 
   2706 			if (signed_op) {
   2707 				alu.src[0].sel = tmp2;
   2708 				alu.src[0].chan = 1;
   2709 			} else {
   2710 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   2711 			}
   2712 
   2713 			alu.last = 1;
   2714 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2715 				return r;
   2716 
   2717 			/* tmp0.x = recip(tmp3.x) */
   2718 			for (j = 0 ; j < 3; j++) {
   2719 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2720 				alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE;
   2721 
   2722 				alu.dst.sel = tmp0;
   2723 				alu.dst.chan = j;
   2724 				alu.dst.write = (j == 0);
   2725 
   2726 				alu.src[0].sel = tmp3;
   2727 				alu.src[0].chan = 0;
   2728 
   2729 				if (j == 2)
   2730 					alu.last = 1;
   2731 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2732 					return r;
   2733 			}
   2734 
   2735 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2736 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   2737 
   2738 			alu.src[0].sel = tmp0;
   2739 			alu.src[0].chan = 0;
   2740 
   2741 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2742 			alu.src[1].value = 0x4f800000;
   2743 
   2744 			alu.dst.sel = tmp3;
   2745 			alu.dst.write = 1;
   2746 			alu.last = 1;
   2747 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   2748 			if (r)
   2749 				return r;
   2750 
   2751 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2752 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT);
   2753 
   2754 			alu.dst.sel = tmp0;
   2755 			alu.dst.chan = 0;
   2756 			alu.dst.write = 1;
   2757 
   2758 			alu.src[0].sel = tmp3;
   2759 			alu.src[0].chan = 0;
   2760 
   2761 			alu.last = 1;
   2762 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2763 				return r;
   2764 
   2765 		} else {
   2766 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2767 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT);
   2768 
   2769 			alu.dst.sel = tmp0;
   2770 			alu.dst.chan = 0;
   2771 			alu.dst.write = 1;
   2772 
   2773 			if (signed_op) {
   2774 				alu.src[0].sel = tmp2;
   2775 				alu.src[0].chan = 1;
   2776 			} else {
   2777 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   2778 			}
   2779 
   2780 			alu.last = 1;
   2781 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2782 				return r;
   2783 		}
   2784 
   2785 		/* 2. tmp0.z = lo (tmp0.x * src2) */
   2786 		if (ctx->bc->chip_class == CAYMAN) {
   2787 			for (j = 0 ; j < 4; j++) {
   2788 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2789 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
   2790 
   2791 				alu.dst.sel = tmp0;
   2792 				alu.dst.chan = j;
   2793 				alu.dst.write = (j == 2);
   2794 
   2795 				alu.src[0].sel = tmp0;
   2796 				alu.src[0].chan = 0;
   2797 				if (signed_op) {
   2798 					alu.src[1].sel = tmp2;
   2799 					alu.src[1].chan = 1;
   2800 				} else {
   2801 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   2802 				}
   2803 
   2804 				alu.last = (j == 3);
   2805 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2806 					return r;
   2807 			}
   2808 		} else {
   2809 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2810 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
   2811 
   2812 			alu.dst.sel = tmp0;
   2813 			alu.dst.chan = 2;
   2814 			alu.dst.write = 1;
   2815 
   2816 			alu.src[0].sel = tmp0;
   2817 			alu.src[0].chan = 0;
   2818 			if (signed_op) {
   2819 				alu.src[1].sel = tmp2;
   2820 				alu.src[1].chan = 1;
   2821 			} else {
   2822 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   2823 			}
   2824 
   2825 			alu.last = 1;
   2826 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2827 				return r;
   2828 		}
   2829 
   2830 		/* 3. tmp0.w = -tmp0.z */
   2831 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2832 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   2833 
   2834 		alu.dst.sel = tmp0;
   2835 		alu.dst.chan = 3;
   2836 		alu.dst.write = 1;
   2837 
   2838 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   2839 		alu.src[1].sel = tmp0;
   2840 		alu.src[1].chan = 2;
   2841 
   2842 		alu.last = 1;
   2843 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2844 			return r;
   2845 
   2846 		/* 4. tmp0.y = hi (tmp0.x * src2) */
   2847 		if (ctx->bc->chip_class == CAYMAN) {
   2848 			for (j = 0 ; j < 4; j++) {
   2849 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2850 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   2851 
   2852 				alu.dst.sel = tmp0;
   2853 				alu.dst.chan = j;
   2854 				alu.dst.write = (j == 1);
   2855 
   2856 				alu.src[0].sel = tmp0;
   2857 				alu.src[0].chan = 0;
   2858 
   2859 				if (signed_op) {
   2860 					alu.src[1].sel = tmp2;
   2861 					alu.src[1].chan = 1;
   2862 				} else {
   2863 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   2864 				}
   2865 				alu.last = (j == 3);
   2866 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2867 					return r;
   2868 			}
   2869 		} else {
   2870 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2871 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   2872 
   2873 			alu.dst.sel = tmp0;
   2874 			alu.dst.chan = 1;
   2875 			alu.dst.write = 1;
   2876 
   2877 			alu.src[0].sel = tmp0;
   2878 			alu.src[0].chan = 0;
   2879 
   2880 			if (signed_op) {
   2881 				alu.src[1].sel = tmp2;
   2882 				alu.src[1].chan = 1;
   2883 			} else {
   2884 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   2885 			}
   2886 
   2887 			alu.last = 1;
   2888 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2889 				return r;
   2890 		}
   2891 
   2892 		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
   2893 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2894 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
   2895 		alu.is_op3 = 1;
   2896 
   2897 		alu.dst.sel = tmp0;
   2898 		alu.dst.chan = 2;
   2899 		alu.dst.write = 1;
   2900 
   2901 		alu.src[0].sel = tmp0;
   2902 		alu.src[0].chan = 1;
   2903 		alu.src[1].sel = tmp0;
   2904 		alu.src[1].chan = 3;
   2905 		alu.src[2].sel = tmp0;
   2906 		alu.src[2].chan = 2;
   2907 
   2908 		alu.last = 1;
   2909 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2910 			return r;
   2911 
   2912 		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
   2913 		if (ctx->bc->chip_class == CAYMAN) {
   2914 			for (j = 0 ; j < 4; j++) {
   2915 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2916 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   2917 
   2918 				alu.dst.sel = tmp0;
   2919 				alu.dst.chan = j;
   2920 				alu.dst.write = (j == 3);
   2921 
   2922 				alu.src[0].sel = tmp0;
   2923 				alu.src[0].chan = 2;
   2924 
   2925 				alu.src[1].sel = tmp0;
   2926 				alu.src[1].chan = 0;
   2927 
   2928 				alu.last = (j == 3);
   2929 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2930 					return r;
   2931 			}
   2932 		} else {
   2933 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2934 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   2935 
   2936 			alu.dst.sel = tmp0;
   2937 			alu.dst.chan = 3;
   2938 			alu.dst.write = 1;
   2939 
   2940 			alu.src[0].sel = tmp0;
   2941 			alu.src[0].chan = 2;
   2942 
   2943 			alu.src[1].sel = tmp0;
   2944 			alu.src[1].chan = 0;
   2945 
   2946 			alu.last = 1;
   2947 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2948 				return r;
   2949 		}
   2950 
   2951 		/* 7. tmp1.x = tmp0.x - tmp0.w */
   2952 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2953 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   2954 
   2955 		alu.dst.sel = tmp1;
   2956 		alu.dst.chan = 0;
   2957 		alu.dst.write = 1;
   2958 
   2959 		alu.src[0].sel = tmp0;
   2960 		alu.src[0].chan = 0;
   2961 		alu.src[1].sel = tmp0;
   2962 		alu.src[1].chan = 3;
   2963 
   2964 		alu.last = 1;
   2965 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2966 			return r;
   2967 
   2968 		/* 8. tmp1.y = tmp0.x + tmp0.w */
   2969 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2970 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
   2971 
   2972 		alu.dst.sel = tmp1;
   2973 		alu.dst.chan = 1;
   2974 		alu.dst.write = 1;
   2975 
   2976 		alu.src[0].sel = tmp0;
   2977 		alu.src[0].chan = 0;
   2978 		alu.src[1].sel = tmp0;
   2979 		alu.src[1].chan = 3;
   2980 
   2981 		alu.last = 1;
   2982 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   2983 			return r;
   2984 
   2985 		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
   2986 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2987 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
   2988 		alu.is_op3 = 1;
   2989 
   2990 		alu.dst.sel = tmp0;
   2991 		alu.dst.chan = 0;
   2992 		alu.dst.write = 1;
   2993 
   2994 		alu.src[0].sel = tmp0;
   2995 		alu.src[0].chan = 1;
   2996 		alu.src[1].sel = tmp1;
   2997 		alu.src[1].chan = 1;
   2998 		alu.src[2].sel = tmp1;
   2999 		alu.src[2].chan = 0;
   3000 
   3001 		alu.last = 1;
   3002 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3003 			return r;
   3004 
   3005 		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
   3006 		if (ctx->bc->chip_class == CAYMAN) {
   3007 			for (j = 0 ; j < 4; j++) {
   3008 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3009 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   3010 
   3011 				alu.dst.sel = tmp0;
   3012 				alu.dst.chan = j;
   3013 				alu.dst.write = (j == 2);
   3014 
   3015 				alu.src[0].sel = tmp0;
   3016 				alu.src[0].chan = 0;
   3017 
   3018 				if (signed_op) {
   3019 					alu.src[1].sel = tmp2;
   3020 					alu.src[1].chan = 0;
   3021 				} else {
   3022 					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3023 				}
   3024 
   3025 				alu.last = (j == 3);
   3026 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3027 					return r;
   3028 			}
   3029 		} else {
   3030 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3031 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
   3032 
   3033 			alu.dst.sel = tmp0;
   3034 			alu.dst.chan = 2;
   3035 			alu.dst.write = 1;
   3036 
   3037 			alu.src[0].sel = tmp0;
   3038 			alu.src[0].chan = 0;
   3039 
   3040 			if (signed_op) {
   3041 				alu.src[1].sel = tmp2;
   3042 				alu.src[1].chan = 0;
   3043 			} else {
   3044 				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3045 			}
   3046 
   3047 			alu.last = 1;
   3048 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3049 				return r;
   3050 		}
   3051 
   3052 		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
   3053 		if (ctx->bc->chip_class == CAYMAN) {
   3054 			for (j = 0 ; j < 4; j++) {
   3055 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3056 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
   3057 
   3058 				alu.dst.sel = tmp0;
   3059 				alu.dst.chan = j;
   3060 				alu.dst.write = (j == 1);
   3061 
   3062 				if (signed_op) {
   3063 					alu.src[0].sel = tmp2;
   3064 					alu.src[0].chan = 1;
   3065 				} else {
   3066 					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   3067 				}
   3068 
   3069 				alu.src[1].sel = tmp0;
   3070 				alu.src[1].chan = 2;
   3071 
   3072 				alu.last = (j == 3);
   3073 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3074 					return r;
   3075 			}
   3076 		} else {
   3077 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3078 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
   3079 
   3080 			alu.dst.sel = tmp0;
   3081 			alu.dst.chan = 1;
   3082 			alu.dst.write = 1;
   3083 
   3084 			if (signed_op) {
   3085 				alu.src[0].sel = tmp2;
   3086 				alu.src[0].chan = 1;
   3087 			} else {
   3088 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   3089 			}
   3090 
   3091 			alu.src[1].sel = tmp0;
   3092 			alu.src[1].chan = 2;
   3093 
   3094 			alu.last = 1;
   3095 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3096 				return r;
   3097 		}
   3098 
   3099 		/* 12. tmp0.w = src1 - tmp0.y       = r */
   3100 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3101 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   3102 
   3103 		alu.dst.sel = tmp0;
   3104 		alu.dst.chan = 3;
   3105 		alu.dst.write = 1;
   3106 
   3107 		if (signed_op) {
   3108 			alu.src[0].sel = tmp2;
   3109 			alu.src[0].chan = 0;
   3110 		} else {
   3111 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3112 		}
   3113 
   3114 		alu.src[1].sel = tmp0;
   3115 		alu.src[1].chan = 1;
   3116 
   3117 		alu.last = 1;
   3118 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3119 			return r;
   3120 
   3121 		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
   3122 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3123 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
   3124 
   3125 		alu.dst.sel = tmp1;
   3126 		alu.dst.chan = 0;
   3127 		alu.dst.write = 1;
   3128 
   3129 		alu.src[0].sel = tmp0;
   3130 		alu.src[0].chan = 3;
   3131 		if (signed_op) {
   3132 			alu.src[1].sel = tmp2;
   3133 			alu.src[1].chan = 1;
   3134 		} else {
   3135 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   3136 		}
   3137 
   3138 		alu.last = 1;
   3139 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3140 			return r;
   3141 
   3142 		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
   3143 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3144 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT);
   3145 
   3146 		alu.dst.sel = tmp1;
   3147 		alu.dst.chan = 1;
   3148 		alu.dst.write = 1;
   3149 
   3150 		if (signed_op) {
   3151 			alu.src[0].sel = tmp2;
   3152 			alu.src[0].chan = 0;
   3153 		} else {
   3154 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3155 		}
   3156 
   3157 		alu.src[1].sel = tmp0;
   3158 		alu.src[1].chan = 1;
   3159 
   3160 		alu.last = 1;
   3161 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3162 			return r;
   3163 
   3164 		if (mod) { /* UMOD */
   3165 
   3166 			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
   3167 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3168 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   3169 
   3170 			alu.dst.sel = tmp1;
   3171 			alu.dst.chan = 2;
   3172 			alu.dst.write = 1;
   3173 
   3174 			alu.src[0].sel = tmp0;
   3175 			alu.src[0].chan = 3;
   3176 
   3177 			if (signed_op) {
   3178 				alu.src[1].sel = tmp2;
   3179 				alu.src[1].chan = 1;
   3180 			} else {
   3181 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   3182 			}
   3183 
   3184 			alu.last = 1;
   3185 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3186 				return r;
   3187 
   3188 			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
   3189 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3190 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
   3191 
   3192 			alu.dst.sel = tmp1;
   3193 			alu.dst.chan = 3;
   3194 			alu.dst.write = 1;
   3195 
   3196 			alu.src[0].sel = tmp0;
   3197 			alu.src[0].chan = 3;
   3198 			if (signed_op) {
   3199 				alu.src[1].sel = tmp2;
   3200 				alu.src[1].chan = 1;
   3201 			} else {
   3202 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   3203 			}
   3204 
   3205 			alu.last = 1;
   3206 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3207 				return r;
   3208 
   3209 		} else { /* UDIV */
   3210 
   3211 			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
   3212 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3213 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
   3214 
   3215 			alu.dst.sel = tmp1;
   3216 			alu.dst.chan = 2;
   3217 			alu.dst.write = 1;
   3218 
   3219 			alu.src[0].sel = tmp0;
   3220 			alu.src[0].chan = 2;
   3221 			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
   3222 
   3223 			alu.last = 1;
   3224 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3225 				return r;
   3226 
   3227 			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
   3228 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3229 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
   3230 
   3231 			alu.dst.sel = tmp1;
   3232 			alu.dst.chan = 3;
   3233 			alu.dst.write = 1;
   3234 
   3235 			alu.src[0].sel = tmp0;
   3236 			alu.src[0].chan = 2;
   3237 			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
   3238 
   3239 			alu.last = 1;
   3240 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3241 				return r;
   3242 
   3243 		}
   3244 
   3245 		/* 17. tmp1.x = tmp1.x & tmp1.y */
   3246 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3247 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT);
   3248 
   3249 		alu.dst.sel = tmp1;
   3250 		alu.dst.chan = 0;
   3251 		alu.dst.write = 1;
   3252 
   3253 		alu.src[0].sel = tmp1;
   3254 		alu.src[0].chan = 0;
   3255 		alu.src[1].sel = tmp1;
   3256 		alu.src[1].chan = 1;
   3257 
   3258 		alu.last = 1;
   3259 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3260 			return r;
   3261 
   3262 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
   3263 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
   3264 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3265 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
   3266 		alu.is_op3 = 1;
   3267 
   3268 		alu.dst.sel = tmp0;
   3269 		alu.dst.chan = 2;
   3270 		alu.dst.write = 1;
   3271 
   3272 		alu.src[0].sel = tmp1;
   3273 		alu.src[0].chan = 0;
   3274 		alu.src[1].sel = tmp0;
   3275 		alu.src[1].chan = mod ? 3 : 2;
   3276 		alu.src[2].sel = tmp1;
   3277 		alu.src[2].chan = 2;
   3278 
   3279 		alu.last = 1;
   3280 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3281 			return r;
   3282 
   3283 		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
   3284 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3285 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDE_INT);
   3286 		alu.is_op3 = 1;
   3287 
   3288 		if (signed_op) {
   3289 			alu.dst.sel = tmp0;
   3290 			alu.dst.chan = 2;
   3291 			alu.dst.write = 1;
   3292 		} else {
   3293 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3294 		}
   3295 
   3296 		alu.src[0].sel = tmp1;
   3297 		alu.src[0].chan = 1;
   3298 		alu.src[1].sel = tmp1;
   3299 		alu.src[1].chan = 3;
   3300 		alu.src[2].sel = tmp0;
   3301 		alu.src[2].chan = 2;
   3302 
   3303 		alu.last = 1;
   3304 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3305 			return r;
   3306 
   3307 		if (signed_op) {
   3308 
   3309 			/* fix the sign of the result */
   3310 
   3311 			if (mod) {
   3312 
   3313 				/* tmp0.x = -tmp0.z */
   3314 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3315 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   3316 
   3317 				alu.dst.sel = tmp0;
   3318 				alu.dst.chan = 0;
   3319 				alu.dst.write = 1;
   3320 
   3321 				alu.src[0].sel = V_SQ_ALU_SRC_0;
   3322 				alu.src[1].sel = tmp0;
   3323 				alu.src[1].chan = 2;
   3324 
   3325 				alu.last = 1;
   3326 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3327 					return r;
   3328 
   3329 				/* sign of the remainder is the same as the sign of src0 */
   3330 				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
   3331 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3332 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
   3333 				alu.is_op3 = 1;
   3334 
   3335 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3336 
   3337 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3338 				alu.src[1].sel = tmp0;
   3339 				alu.src[1].chan = 2;
   3340 				alu.src[2].sel = tmp0;
   3341 				alu.src[2].chan = 0;
   3342 
   3343 				alu.last = 1;
   3344 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3345 					return r;
   3346 
   3347 			} else {
   3348 
   3349 				/* tmp0.x = -tmp0.z */
   3350 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3351 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   3352 
   3353 				alu.dst.sel = tmp0;
   3354 				alu.dst.chan = 0;
   3355 				alu.dst.write = 1;
   3356 
   3357 				alu.src[0].sel = V_SQ_ALU_SRC_0;
   3358 				alu.src[1].sel = tmp0;
   3359 				alu.src[1].chan = 2;
   3360 
   3361 				alu.last = 1;
   3362 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3363 					return r;
   3364 
   3365 				/* fix the quotient sign (same as the sign of src0*src1) */
   3366 				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
   3367 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3368 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
   3369 				alu.is_op3 = 1;
   3370 
   3371 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3372 
   3373 				alu.src[0].sel = tmp2;
   3374 				alu.src[0].chan = 2;
   3375 				alu.src[1].sel = tmp0;
   3376 				alu.src[1].chan = 2;
   3377 				alu.src[2].sel = tmp0;
   3378 				alu.src[2].chan = 0;
   3379 
   3380 				alu.last = 1;
   3381 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   3382 					return r;
   3383 			}
   3384 		}
   3385 	}
   3386 	return 0;
   3387 }
   3388 
   3389 static int tgsi_udiv(struct r600_shader_ctx *ctx)
   3390 {
   3391 	return tgsi_divmod(ctx, 0, 0);
   3392 }
   3393 
   3394 static int tgsi_umod(struct r600_shader_ctx *ctx)
   3395 {
   3396 	return tgsi_divmod(ctx, 1, 0);
   3397 }
   3398 
   3399 static int tgsi_idiv(struct r600_shader_ctx *ctx)
   3400 {
   3401 	return tgsi_divmod(ctx, 0, 1);
   3402 }
   3403 
   3404 static int tgsi_imod(struct r600_shader_ctx *ctx)
   3405 {
   3406 	return tgsi_divmod(ctx, 1, 1);
   3407 }
   3408 
   3409 
   3410 static int tgsi_f2i(struct r600_shader_ctx *ctx)
   3411 {
   3412 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3413 	struct r600_bytecode_alu alu;
   3414 	int i, r;
   3415 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   3416 	int last_inst = tgsi_last_instruction(write_mask);
   3417 
   3418 	for (i = 0; i < 4; i++) {
   3419 		if (!(write_mask & (1<<i)))
   3420 			continue;
   3421 
   3422 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3423 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
   3424 
   3425 		alu.dst.sel = ctx->temp_reg;
   3426 		alu.dst.chan = i;
   3427 		alu.dst.write = 1;
   3428 
   3429 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3430 		if (i == last_inst)
   3431 			alu.last = 1;
   3432 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3433 		if (r)
   3434 			return r;
   3435 	}
   3436 
   3437 	for (i = 0; i < 4; i++) {
   3438 		if (!(write_mask & (1<<i)))
   3439 			continue;
   3440 
   3441 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3442 		alu.inst = ctx->inst_info->r600_opcode;
   3443 
   3444 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3445 
   3446 		alu.src[0].sel = ctx->temp_reg;
   3447 		alu.src[0].chan = i;
   3448 
   3449 		if (i == last_inst || alu.inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT)
   3450 			alu.last = 1;
   3451 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3452 		if (r)
   3453 			return r;
   3454 	}
   3455 
   3456 	return 0;
   3457 }
   3458 
   3459 static int tgsi_iabs(struct r600_shader_ctx *ctx)
   3460 {
   3461 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3462 	struct r600_bytecode_alu alu;
   3463 	int i, r;
   3464 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   3465 	int last_inst = tgsi_last_instruction(write_mask);
   3466 
   3467 	/* tmp = -src */
   3468 	for (i = 0; i < 4; i++) {
   3469 		if (!(write_mask & (1<<i)))
   3470 			continue;
   3471 
   3472 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3473 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT);
   3474 
   3475 		alu.dst.sel = ctx->temp_reg;
   3476 		alu.dst.chan = i;
   3477 		alu.dst.write = 1;
   3478 
   3479 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3480 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   3481 
   3482 		if (i == last_inst)
   3483 			alu.last = 1;
   3484 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3485 		if (r)
   3486 			return r;
   3487 	}
   3488 
   3489 	/* dst = (src >= 0 ? src : tmp) */
   3490 	for (i = 0; i < 4; i++) {
   3491 		if (!(write_mask & (1<<i)))
   3492 			continue;
   3493 
   3494 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3495 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
   3496 		alu.is_op3 = 1;
   3497 		alu.dst.write = 1;
   3498 
   3499 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3500 
   3501 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3502 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3503 		alu.src[2].sel = ctx->temp_reg;
   3504 		alu.src[2].chan = i;
   3505 
   3506 		if (i == last_inst)
   3507 			alu.last = 1;
   3508 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3509 		if (r)
   3510 			return r;
   3511 	}
   3512 	return 0;
   3513 }
   3514 
   3515 static int tgsi_issg(struct r600_shader_ctx *ctx)
   3516 {
   3517 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3518 	struct r600_bytecode_alu alu;
   3519 	int i, r;
   3520 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   3521 	int last_inst = tgsi_last_instruction(write_mask);
   3522 
   3523 	/* tmp = (src >= 0 ? src : -1) */
   3524 	for (i = 0; i < 4; i++) {
   3525 		if (!(write_mask & (1<<i)))
   3526 			continue;
   3527 
   3528 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3529 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE_INT);
   3530 		alu.is_op3 = 1;
   3531 
   3532 		alu.dst.sel = ctx->temp_reg;
   3533 		alu.dst.chan = i;
   3534 		alu.dst.write = 1;
   3535 
   3536 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3537 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3538 		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
   3539 
   3540 		if (i == last_inst)
   3541 			alu.last = 1;
   3542 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3543 		if (r)
   3544 			return r;
   3545 	}
   3546 
   3547 	/* dst = (tmp > 0 ? 1 : tmp) */
   3548 	for (i = 0; i < 4; i++) {
   3549 		if (!(write_mask & (1<<i)))
   3550 			continue;
   3551 
   3552 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3553 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT_INT);
   3554 		alu.is_op3 = 1;
   3555 		alu.dst.write = 1;
   3556 
   3557 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3558 
   3559 		alu.src[0].sel = ctx->temp_reg;
   3560 		alu.src[0].chan = i;
   3561 
   3562 		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
   3563 
   3564 		alu.src[2].sel = ctx->temp_reg;
   3565 		alu.src[2].chan = i;
   3566 
   3567 		if (i == last_inst)
   3568 			alu.last = 1;
   3569 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3570 		if (r)
   3571 			return r;
   3572 	}
   3573 	return 0;
   3574 }
   3575 
   3576 
   3577 
   3578 static int tgsi_ssg(struct r600_shader_ctx *ctx)
   3579 {
   3580 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3581 	struct r600_bytecode_alu alu;
   3582 	int i, r;
   3583 
   3584 	/* tmp = (src > 0 ? 1 : src) */
   3585 	for (i = 0; i < 4; i++) {
   3586 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3587 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
   3588 		alu.is_op3 = 1;
   3589 
   3590 		alu.dst.sel = ctx->temp_reg;
   3591 		alu.dst.chan = i;
   3592 
   3593 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3594 		alu.src[1].sel = V_SQ_ALU_SRC_1;
   3595 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
   3596 
   3597 		if (i == 3)
   3598 			alu.last = 1;
   3599 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3600 		if (r)
   3601 			return r;
   3602 	}
   3603 
   3604 	/* dst = (-tmp > 0 ? -1 : tmp) */
   3605 	for (i = 0; i < 4; i++) {
   3606 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3607 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT);
   3608 		alu.is_op3 = 1;
   3609 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3610 
   3611 		alu.src[0].sel = ctx->temp_reg;
   3612 		alu.src[0].chan = i;
   3613 		alu.src[0].neg = 1;
   3614 
   3615 		alu.src[1].sel = V_SQ_ALU_SRC_1;
   3616 		alu.src[1].neg = 1;
   3617 
   3618 		alu.src[2].sel = ctx->temp_reg;
   3619 		alu.src[2].chan = i;
   3620 
   3621 		if (i == 3)
   3622 			alu.last = 1;
   3623 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3624 		if (r)
   3625 			return r;
   3626 	}
   3627 	return 0;
   3628 }
   3629 
   3630 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
   3631 {
   3632 	struct r600_bytecode_alu alu;
   3633 	int i, r;
   3634 
   3635 	for (i = 0; i < 4; i++) {
   3636 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3637 		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
   3638 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP);
   3639 			alu.dst.chan = i;
   3640 		} else {
   3641 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   3642 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3643 			alu.src[0].sel = ctx->temp_reg;
   3644 			alu.src[0].chan = i;
   3645 		}
   3646 		if (i == 3) {
   3647 			alu.last = 1;
   3648 		}
   3649 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3650 		if (r)
   3651 			return r;
   3652 	}
   3653 	return 0;
   3654 }
   3655 
   3656 static int tgsi_op3(struct r600_shader_ctx *ctx)
   3657 {
   3658 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3659 	struct r600_bytecode_alu alu;
   3660 	int i, j, r;
   3661 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   3662 
   3663 	for (i = 0; i < lasti + 1; i++) {
   3664 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   3665 			continue;
   3666 
   3667 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3668 		alu.inst = ctx->inst_info->r600_opcode;
   3669 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   3670 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
   3671 		}
   3672 
   3673 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3674 		alu.dst.chan = i;
   3675 		alu.dst.write = 1;
   3676 		alu.is_op3 = 1;
   3677 		if (i == lasti) {
   3678 			alu.last = 1;
   3679 		}
   3680 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3681 		if (r)
   3682 			return r;
   3683 	}
   3684 	return 0;
   3685 }
   3686 
   3687 static int tgsi_dp(struct r600_shader_ctx *ctx)
   3688 {
   3689 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3690 	struct r600_bytecode_alu alu;
   3691 	int i, j, r;
   3692 
   3693 	for (i = 0; i < 4; i++) {
   3694 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3695 		alu.inst = ctx->inst_info->r600_opcode;
   3696 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   3697 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
   3698 		}
   3699 
   3700 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3701 		alu.dst.chan = i;
   3702 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   3703 		/* handle some special cases */
   3704 		switch (ctx->inst_info->tgsi_opcode) {
   3705 		case TGSI_OPCODE_DP2:
   3706 			if (i > 1) {
   3707 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
   3708 				alu.src[0].chan = alu.src[1].chan = 0;
   3709 			}
   3710 			break;
   3711 		case TGSI_OPCODE_DP3:
   3712 			if (i > 2) {
   3713 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
   3714 				alu.src[0].chan = alu.src[1].chan = 0;
   3715 			}
   3716 			break;
   3717 		case TGSI_OPCODE_DPH:
   3718 			if (i == 3) {
   3719 				alu.src[0].sel = V_SQ_ALU_SRC_1;
   3720 				alu.src[0].chan = 0;
   3721 				alu.src[0].neg = 0;
   3722 			}
   3723 			break;
   3724 		default:
   3725 			break;
   3726 		}
   3727 		if (i == 3) {
   3728 			alu.last = 1;
   3729 		}
   3730 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3731 		if (r)
   3732 			return r;
   3733 	}
   3734 	return 0;
   3735 }
   3736 
   3737 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
   3738 						    unsigned index)
   3739 {
   3740 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3741 	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
   3742 		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
   3743 		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
   3744 		ctx->src[index].neg || ctx->src[index].abs;
   3745 }
   3746 
   3747 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
   3748 					unsigned index)
   3749 {
   3750 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3751 	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
   3752 }
   3753 
   3754 static int tgsi_tex(struct r600_shader_ctx *ctx)
   3755 {
   3756 	static float one_point_five = 1.5f;
   3757 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3758 	struct r600_bytecode_tex tex;
   3759 	struct r600_bytecode_alu alu;
   3760 	unsigned src_gpr;
   3761 	int r, i, j;
   3762 	int opcode;
   3763 	/* Texture fetch instructions can only use gprs as source.
   3764 	 * Also they cannot negate the source or take the absolute value */
   3765 	const boolean src_requires_loading = inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
   3766                                              tgsi_tex_src_requires_loading(ctx, 0);
   3767 	boolean src_loaded = FALSE;
   3768 	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
   3769 	uint8_t offset_x = 0, offset_y = 0, offset_z = 0;
   3770 
   3771 	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
   3772 
   3773 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
   3774 		/* get offset values */
   3775 		if (inst->Texture.NumOffsets) {
   3776 			assert(inst->Texture.NumOffsets == 1);
   3777 
   3778 			offset_x = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
   3779 			offset_y = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
   3780 			offset_z = ctx->literals[inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
   3781 		}
   3782 	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
   3783 		/* TGSI moves the sampler to src reg 3 for TXD */
   3784 		sampler_src_reg = 3;
   3785 
   3786 		for (i = 1; i < 3; i++) {
   3787 			/* set gradients h/v */
   3788 			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   3789 			tex.inst = (i == 1) ? SQ_TEX_INST_SET_GRADIENTS_H :
   3790 				SQ_TEX_INST_SET_GRADIENTS_V;
   3791 			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   3792 			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
   3793 
   3794 			if (tgsi_tex_src_requires_loading(ctx, i)) {
   3795 				tex.src_gpr = r600_get_temp(ctx);
   3796 				tex.src_sel_x = 0;
   3797 				tex.src_sel_y = 1;
   3798 				tex.src_sel_z = 2;
   3799 				tex.src_sel_w = 3;
   3800 
   3801 				for (j = 0; j < 4; j++) {
   3802 					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3803 					alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   3804                                         r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
   3805                                         alu.dst.sel = tex.src_gpr;
   3806                                         alu.dst.chan = j;
   3807                                         if (j == 3)
   3808                                                 alu.last = 1;
   3809                                         alu.dst.write = 1;
   3810                                         r = r600_bytecode_add_alu(ctx->bc, &alu);
   3811                                         if (r)
   3812                                                 return r;
   3813 				}
   3814 
   3815 			} else {
   3816 				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
   3817 				tex.src_sel_x = ctx->src[i].swizzle[0];
   3818 				tex.src_sel_y = ctx->src[i].swizzle[1];
   3819 				tex.src_sel_z = ctx->src[i].swizzle[2];
   3820 				tex.src_sel_w = ctx->src[i].swizzle[3];
   3821 				tex.src_rel = ctx->src[i].rel;
   3822 			}
   3823 			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
   3824 			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
   3825 			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
   3826 				tex.coord_type_x = 1;
   3827 				tex.coord_type_y = 1;
   3828 				tex.coord_type_z = 1;
   3829 				tex.coord_type_w = 1;
   3830 			}
   3831 			r = r600_bytecode_add_tex(ctx->bc, &tex);
   3832 			if (r)
   3833 				return r;
   3834 		}
   3835 	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
   3836 		int out_chan;
   3837 		/* Add perspective divide */
   3838 		if (ctx->bc->chip_class == CAYMAN) {
   3839 			out_chan = 2;
   3840 			for (i = 0; i < 3; i++) {
   3841 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3842 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   3843 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   3844 
   3845 				alu.dst.sel = ctx->temp_reg;
   3846 				alu.dst.chan = i;
   3847 				if (i == 2)
   3848 					alu.last = 1;
   3849 				if (out_chan == i)
   3850 					alu.dst.write = 1;
   3851 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   3852 				if (r)
   3853 					return r;
   3854 			}
   3855 
   3856 		} else {
   3857 			out_chan = 3;
   3858 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3859 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   3860 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   3861 
   3862 			alu.dst.sel = ctx->temp_reg;
   3863 			alu.dst.chan = out_chan;
   3864 			alu.last = 1;
   3865 			alu.dst.write = 1;
   3866 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   3867 			if (r)
   3868 				return r;
   3869 		}
   3870 
   3871 		for (i = 0; i < 3; i++) {
   3872 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3873 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   3874 			alu.src[0].sel = ctx->temp_reg;
   3875 			alu.src[0].chan = out_chan;
   3876 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3877 			alu.dst.sel = ctx->temp_reg;
   3878 			alu.dst.chan = i;
   3879 			alu.dst.write = 1;
   3880 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   3881 			if (r)
   3882 				return r;
   3883 		}
   3884 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3885 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   3886 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   3887 		alu.src[0].chan = 0;
   3888 		alu.dst.sel = ctx->temp_reg;
   3889 		alu.dst.chan = 3;
   3890 		alu.last = 1;
   3891 		alu.dst.write = 1;
   3892 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3893 		if (r)
   3894 			return r;
   3895 		src_loaded = TRUE;
   3896 		src_gpr = ctx->temp_reg;
   3897 	}
   3898 
   3899 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
   3900 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
   3901 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
   3902 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
   3903 
   3904 		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
   3905 		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
   3906 
   3907 		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
   3908 		for (i = 0; i < 4; i++) {
   3909 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3910 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE);
   3911 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
   3912 			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
   3913 			alu.dst.sel = ctx->temp_reg;
   3914 			alu.dst.chan = i;
   3915 			if (i == 3)
   3916 				alu.last = 1;
   3917 			alu.dst.write = 1;
   3918 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   3919 			if (r)
   3920 				return r;
   3921 		}
   3922 
   3923 		/* tmp1.z = RCP_e(|tmp1.z|) */
   3924 		if (ctx->bc->chip_class == CAYMAN) {
   3925 			for (i = 0; i < 3; i++) {
   3926 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3927 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   3928 				alu.src[0].sel = ctx->temp_reg;
   3929 				alu.src[0].chan = 2;
   3930 				alu.src[0].abs = 1;
   3931 				alu.dst.sel = ctx->temp_reg;
   3932 				alu.dst.chan = i;
   3933 				if (i == 2)
   3934 					alu.dst.write = 1;
   3935 				if (i == 2)
   3936 					alu.last = 1;
   3937 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   3938 				if (r)
   3939 					return r;
   3940 			}
   3941 		} else {
   3942 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3943 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   3944 			alu.src[0].sel = ctx->temp_reg;
   3945 			alu.src[0].chan = 2;
   3946 			alu.src[0].abs = 1;
   3947 			alu.dst.sel = ctx->temp_reg;
   3948 			alu.dst.chan = 2;
   3949 			alu.dst.write = 1;
   3950 			alu.last = 1;
   3951 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   3952 			if (r)
   3953 				return r;
   3954 		}
   3955 
   3956 		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
   3957 		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
   3958 		 * muladd has no writemask, have to use another temp
   3959 		 */
   3960 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3961 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
   3962 		alu.is_op3 = 1;
   3963 
   3964 		alu.src[0].sel = ctx->temp_reg;
   3965 		alu.src[0].chan = 0;
   3966 		alu.src[1].sel = ctx->temp_reg;
   3967 		alu.src[1].chan = 2;
   3968 
   3969 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
   3970 		alu.src[2].chan = 0;
   3971 		alu.src[2].value = *(uint32_t *)&one_point_five;
   3972 
   3973 		alu.dst.sel = ctx->temp_reg;
   3974 		alu.dst.chan = 0;
   3975 		alu.dst.write = 1;
   3976 
   3977 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3978 		if (r)
   3979 			return r;
   3980 
   3981 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3982 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
   3983 		alu.is_op3 = 1;
   3984 
   3985 		alu.src[0].sel = ctx->temp_reg;
   3986 		alu.src[0].chan = 1;
   3987 		alu.src[1].sel = ctx->temp_reg;
   3988 		alu.src[1].chan = 2;
   3989 
   3990 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
   3991 		alu.src[2].chan = 0;
   3992 		alu.src[2].value = *(uint32_t *)&one_point_five;
   3993 
   3994 		alu.dst.sel = ctx->temp_reg;
   3995 		alu.dst.chan = 1;
   3996 		alu.dst.write = 1;
   3997 
   3998 		alu.last = 1;
   3999 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4000 		if (r)
   4001 			return r;
   4002 		/* write initial W value into Z component */
   4003 		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
   4004 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4005 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   4006 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   4007 			alu.dst.sel = ctx->temp_reg;
   4008 			alu.dst.chan = 2;
   4009 			alu.dst.write = 1;
   4010 			alu.last = 1;
   4011 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4012 			if (r)
   4013 				return r;
   4014 		}
   4015 
   4016 		/* for cube forms of lod and bias we need to route the lod
   4017 		   value into Z */
   4018 		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
   4019 		    inst->Instruction.Opcode == TGSI_OPCODE_TXL) {
   4020 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4021 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   4022 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   4023 			alu.dst.sel = ctx->temp_reg;
   4024 			alu.dst.chan = 2;
   4025 			alu.last = 1;
   4026 			alu.dst.write = 1;
   4027 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4028 			if (r)
   4029 				return r;
   4030 		}
   4031 
   4032 		src_loaded = TRUE;
   4033 		src_gpr = ctx->temp_reg;
   4034 	}
   4035 
   4036 	if (src_requires_loading && !src_loaded) {
   4037 		for (i = 0; i < 4; i++) {
   4038 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4039 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   4040 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   4041 			alu.dst.sel = ctx->temp_reg;
   4042 			alu.dst.chan = i;
   4043 			if (i == 3)
   4044 				alu.last = 1;
   4045 			alu.dst.write = 1;
   4046 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4047 			if (r)
   4048 				return r;
   4049 		}
   4050 		src_loaded = TRUE;
   4051 		src_gpr = ctx->temp_reg;
   4052 	}
   4053 
   4054 	opcode = ctx->inst_info->r600_opcode;
   4055 	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
   4056 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
   4057 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
   4058 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
   4059 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
   4060 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
   4061 		switch (opcode) {
   4062 		case SQ_TEX_INST_SAMPLE:
   4063 			opcode = SQ_TEX_INST_SAMPLE_C;
   4064 			break;
   4065 		case SQ_TEX_INST_SAMPLE_L:
   4066 			opcode = SQ_TEX_INST_SAMPLE_C_L;
   4067 			break;
   4068 		case SQ_TEX_INST_SAMPLE_LB:
   4069 			opcode = SQ_TEX_INST_SAMPLE_C_LB;
   4070 			break;
   4071 		case SQ_TEX_INST_SAMPLE_G:
   4072 			opcode = SQ_TEX_INST_SAMPLE_C_G;
   4073 			break;
   4074 		}
   4075 	}
   4076 
   4077 	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   4078 	tex.inst = opcode;
   4079 
   4080 	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   4081 	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
   4082 	tex.src_gpr = src_gpr;
   4083 	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
   4084 	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
   4085 	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
   4086 	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
   4087 	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
   4088 
   4089 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
   4090 		tex.src_sel_x = 4;
   4091 		tex.src_sel_y = 4;
   4092 		tex.src_sel_z = 4;
   4093 		tex.src_sel_w = 4;
   4094 	} else if (src_loaded) {
   4095 		tex.src_sel_x = 0;
   4096 		tex.src_sel_y = 1;
   4097 		tex.src_sel_z = 2;
   4098 		tex.src_sel_w = 3;
   4099 	} else {
   4100 		tex.src_sel_x = ctx->src[0].swizzle[0];
   4101 		tex.src_sel_y = ctx->src[0].swizzle[1];
   4102 		tex.src_sel_z = ctx->src[0].swizzle[2];
   4103 		tex.src_sel_w = ctx->src[0].swizzle[3];
   4104 		tex.src_rel = ctx->src[0].rel;
   4105 	}
   4106 
   4107 	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
   4108 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) {
   4109 		tex.src_sel_x = 1;
   4110 		tex.src_sel_y = 0;
   4111 		tex.src_sel_z = 3;
   4112 		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
   4113 	}
   4114 
   4115 	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
   4116 	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
   4117 		tex.coord_type_x = 1;
   4118 		tex.coord_type_y = 1;
   4119 	}
   4120 	tex.coord_type_z = 1;
   4121 	tex.coord_type_w = 1;
   4122 
   4123 	tex.offset_x = offset_x;
   4124 	tex.offset_y = offset_y;
   4125 	tex.offset_z = offset_z;
   4126 
   4127 	/* Put the depth for comparison in W.
   4128 	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
   4129 	 * Some instructions expect the depth in Z. */
   4130 	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
   4131 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
   4132 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
   4133 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
   4134 	    opcode != SQ_TEX_INST_SAMPLE_C_L &&
   4135 	    opcode != SQ_TEX_INST_SAMPLE_C_LB) {
   4136 		tex.src_sel_w = tex.src_sel_z;
   4137 	}
   4138 
   4139 	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
   4140 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
   4141 		if (opcode == SQ_TEX_INST_SAMPLE_C_L ||
   4142 		    opcode == SQ_TEX_INST_SAMPLE_C_LB) {
   4143 			/* the array index is read from Y */
   4144 			tex.coord_type_y = 0;
   4145 		} else {
   4146 			/* the array index is read from Z */
   4147 			tex.coord_type_z = 0;
   4148 			tex.src_sel_z = tex.src_sel_y;
   4149 		}
   4150 	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
   4151 		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
   4152 		/* the array index is read from Z */
   4153 		tex.coord_type_z = 0;
   4154 
   4155 	r = r600_bytecode_add_tex(ctx->bc, &tex);
   4156 	if (r)
   4157 		return r;
   4158 
   4159 	/* add shadow ambient support  - gallium doesn't do it yet */
   4160 	return 0;
   4161 }
   4162 
   4163 static int tgsi_lrp(struct r600_shader_ctx *ctx)
   4164 {
   4165 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4166 	struct r600_bytecode_alu alu;
   4167 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4168 	unsigned i;
   4169 	int r;
   4170 
   4171 	/* optimize if it's just an equal balance */
   4172 	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
   4173 		for (i = 0; i < lasti + 1; i++) {
   4174 			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4175 				continue;
   4176 
   4177 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4178 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
   4179 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   4180 			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   4181 			alu.omod = 3;
   4182 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4183 			alu.dst.chan = i;
   4184 			if (i == lasti) {
   4185 				alu.last = 1;
   4186 			}
   4187 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4188 			if (r)
   4189 				return r;
   4190 		}
   4191 		return 0;
   4192 	}
   4193 
   4194 	/* 1 - src0 */
   4195 	for (i = 0; i < lasti + 1; i++) {
   4196 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4197 			continue;
   4198 
   4199 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4200 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD);
   4201 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   4202 		alu.src[0].chan = 0;
   4203 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   4204 		r600_bytecode_src_toggle_neg(&alu.src[1]);
   4205 		alu.dst.sel = ctx->temp_reg;
   4206 		alu.dst.chan = i;
   4207 		if (i == lasti) {
   4208 			alu.last = 1;
   4209 		}
   4210 		alu.dst.write = 1;
   4211 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4212 		if (r)
   4213 			return r;
   4214 	}
   4215 
   4216 	/* (1 - src0) * src2 */
   4217 	for (i = 0; i < lasti + 1; i++) {
   4218 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4219 			continue;
   4220 
   4221 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4222 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   4223 		alu.src[0].sel = ctx->temp_reg;
   4224 		alu.src[0].chan = i;
   4225 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   4226 		alu.dst.sel = ctx->temp_reg;
   4227 		alu.dst.chan = i;
   4228 		if (i == lasti) {
   4229 			alu.last = 1;
   4230 		}
   4231 		alu.dst.write = 1;
   4232 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4233 		if (r)
   4234 			return r;
   4235 	}
   4236 
   4237 	/* src0 * src1 + (1 - src0) * src2 */
   4238 	for (i = 0; i < lasti + 1; i++) {
   4239 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4240 			continue;
   4241 
   4242 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4243 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
   4244 		alu.is_op3 = 1;
   4245 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   4246 		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   4247 		alu.src[2].sel = ctx->temp_reg;
   4248 		alu.src[2].chan = i;
   4249 
   4250 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4251 		alu.dst.chan = i;
   4252 		if (i == lasti) {
   4253 			alu.last = 1;
   4254 		}
   4255 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4256 		if (r)
   4257 			return r;
   4258 	}
   4259 	return 0;
   4260 }
   4261 
   4262 static int tgsi_cmp(struct r600_shader_ctx *ctx)
   4263 {
   4264 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4265 	struct r600_bytecode_alu alu;
   4266 	int i, r;
   4267 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4268 
   4269 	for (i = 0; i < lasti + 1; i++) {
   4270 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4271 			continue;
   4272 
   4273 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4274 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE);
   4275 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   4276 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   4277 		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
   4278 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4279 		alu.dst.chan = i;
   4280 		alu.dst.write = 1;
   4281 		alu.is_op3 = 1;
   4282 		if (i == lasti)
   4283 			alu.last = 1;
   4284 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4285 		if (r)
   4286 			return r;
   4287 	}
   4288 	return 0;
   4289 }
   4290 
   4291 static int tgsi_xpd(struct r600_shader_ctx *ctx)
   4292 {
   4293 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4294 	static const unsigned int src0_swizzle[] = {2, 0, 1};
   4295 	static const unsigned int src1_swizzle[] = {1, 2, 0};
   4296 	struct r600_bytecode_alu alu;
   4297 	uint32_t use_temp = 0;
   4298 	int i, r;
   4299 
   4300 	if (inst->Dst[0].Register.WriteMask != 0xf)
   4301 		use_temp = 1;
   4302 
   4303 	for (i = 0; i < 4; i++) {
   4304 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4305 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   4306 		if (i < 3) {
   4307 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
   4308 			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
   4309 		} else {
   4310 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   4311 			alu.src[0].chan = i;
   4312 			alu.src[1].sel = V_SQ_ALU_SRC_0;
   4313 			alu.src[1].chan = i;
   4314 		}
   4315 
   4316 		alu.dst.sel = ctx->temp_reg;
   4317 		alu.dst.chan = i;
   4318 		alu.dst.write = 1;
   4319 
   4320 		if (i == 3)
   4321 			alu.last = 1;
   4322 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4323 		if (r)
   4324 			return r;
   4325 	}
   4326 
   4327 	for (i = 0; i < 4; i++) {
   4328 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4329 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD);
   4330 
   4331 		if (i < 3) {
   4332 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
   4333 			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
   4334 		} else {
   4335 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   4336 			alu.src[0].chan = i;
   4337 			alu.src[1].sel = V_SQ_ALU_SRC_0;
   4338 			alu.src[1].chan = i;
   4339 		}
   4340 
   4341 		alu.src[2].sel = ctx->temp_reg;
   4342 		alu.src[2].neg = 1;
   4343 		alu.src[2].chan = i;
   4344 
   4345 		if (use_temp)
   4346 			alu.dst.sel = ctx->temp_reg;
   4347 		else
   4348 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4349 		alu.dst.chan = i;
   4350 		alu.dst.write = 1;
   4351 		alu.is_op3 = 1;
   4352 		if (i == 3)
   4353 			alu.last = 1;
   4354 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4355 		if (r)
   4356 			return r;
   4357 	}
   4358 	if (use_temp)
   4359 		return tgsi_helper_copy(ctx, inst);
   4360 	return 0;
   4361 }
   4362 
   4363 static int tgsi_exp(struct r600_shader_ctx *ctx)
   4364 {
   4365 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4366 	struct r600_bytecode_alu alu;
   4367 	int r;
   4368 	int i;
   4369 
   4370 	/* result.x = 2^floor(src); */
   4371 	if (inst->Dst[0].Register.WriteMask & 1) {
   4372 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4373 
   4374 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
   4375 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4376 
   4377 		alu.dst.sel = ctx->temp_reg;
   4378 		alu.dst.chan = 0;
   4379 		alu.dst.write = 1;
   4380 		alu.last = 1;
   4381 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4382 		if (r)
   4383 			return r;
   4384 
   4385 		if (ctx->bc->chip_class == CAYMAN) {
   4386 			for (i = 0; i < 3; i++) {
   4387 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   4388 				alu.src[0].sel = ctx->temp_reg;
   4389 				alu.src[0].chan = 0;
   4390 
   4391 				alu.dst.sel = ctx->temp_reg;
   4392 				alu.dst.chan = i;
   4393 				alu.dst.write = i == 0;
   4394 				alu.last = i == 2;
   4395 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4396 				if (r)
   4397 					return r;
   4398 			}
   4399 		} else {
   4400 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   4401 			alu.src[0].sel = ctx->temp_reg;
   4402 			alu.src[0].chan = 0;
   4403 
   4404 			alu.dst.sel = ctx->temp_reg;
   4405 			alu.dst.chan = 0;
   4406 			alu.dst.write = 1;
   4407 			alu.last = 1;
   4408 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4409 			if (r)
   4410 				return r;
   4411 		}
   4412 	}
   4413 
   4414 	/* result.y = tmp - floor(tmp); */
   4415 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
   4416 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4417 
   4418 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT);
   4419 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4420 
   4421 		alu.dst.sel = ctx->temp_reg;
   4422 #if 0
   4423 		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4424 		if (r)
   4425 			return r;
   4426 #endif
   4427 		alu.dst.write = 1;
   4428 		alu.dst.chan = 1;
   4429 
   4430 		alu.last = 1;
   4431 
   4432 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4433 		if (r)
   4434 			return r;
   4435 	}
   4436 
   4437 	/* result.z = RoughApprox2ToX(tmp);*/
   4438 	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
   4439 		if (ctx->bc->chip_class == CAYMAN) {
   4440 			for (i = 0; i < 3; i++) {
   4441 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4442 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   4443 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4444 
   4445 				alu.dst.sel = ctx->temp_reg;
   4446 				alu.dst.chan = i;
   4447 				if (i == 2) {
   4448 					alu.dst.write = 1;
   4449 					alu.last = 1;
   4450 				}
   4451 
   4452 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4453 				if (r)
   4454 					return r;
   4455 			}
   4456 		} else {
   4457 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4458 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   4459 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4460 
   4461 			alu.dst.sel = ctx->temp_reg;
   4462 			alu.dst.write = 1;
   4463 			alu.dst.chan = 2;
   4464 
   4465 			alu.last = 1;
   4466 
   4467 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4468 			if (r)
   4469 				return r;
   4470 		}
   4471 	}
   4472 
   4473 	/* result.w = 1.0;*/
   4474 	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
   4475 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4476 
   4477 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   4478 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   4479 		alu.src[0].chan = 0;
   4480 
   4481 		alu.dst.sel = ctx->temp_reg;
   4482 		alu.dst.chan = 3;
   4483 		alu.dst.write = 1;
   4484 		alu.last = 1;
   4485 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4486 		if (r)
   4487 			return r;
   4488 	}
   4489 	return tgsi_helper_copy(ctx, inst);
   4490 }
   4491 
   4492 static int tgsi_log(struct r600_shader_ctx *ctx)
   4493 {
   4494 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4495 	struct r600_bytecode_alu alu;
   4496 	int r;
   4497 	int i;
   4498 
   4499 	/* result.x = floor(log2(|src|)); */
   4500 	if (inst->Dst[0].Register.WriteMask & 1) {
   4501 		if (ctx->bc->chip_class == CAYMAN) {
   4502 			for (i = 0; i < 3; i++) {
   4503 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4504 
   4505 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   4506 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4507 				r600_bytecode_src_set_abs(&alu.src[0]);
   4508 
   4509 				alu.dst.sel = ctx->temp_reg;
   4510 				alu.dst.chan = i;
   4511 				if (i == 0)
   4512 					alu.dst.write = 1;
   4513 				if (i == 2)
   4514 					alu.last = 1;
   4515 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4516 				if (r)
   4517 					return r;
   4518 			}
   4519 
   4520 		} else {
   4521 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4522 
   4523 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   4524 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4525 			r600_bytecode_src_set_abs(&alu.src[0]);
   4526 
   4527 			alu.dst.sel = ctx->temp_reg;
   4528 			alu.dst.chan = 0;
   4529 			alu.dst.write = 1;
   4530 			alu.last = 1;
   4531 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4532 			if (r)
   4533 				return r;
   4534 		}
   4535 
   4536 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
   4537 		alu.src[0].sel = ctx->temp_reg;
   4538 		alu.src[0].chan = 0;
   4539 
   4540 		alu.dst.sel = ctx->temp_reg;
   4541 		alu.dst.chan = 0;
   4542 		alu.dst.write = 1;
   4543 		alu.last = 1;
   4544 
   4545 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4546 		if (r)
   4547 			return r;
   4548 	}
   4549 
   4550 	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
   4551 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
   4552 
   4553 		if (ctx->bc->chip_class == CAYMAN) {
   4554 			for (i = 0; i < 3; i++) {
   4555 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4556 
   4557 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   4558 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4559 				r600_bytecode_src_set_abs(&alu.src[0]);
   4560 
   4561 				alu.dst.sel = ctx->temp_reg;
   4562 				alu.dst.chan = i;
   4563 				if (i == 1)
   4564 					alu.dst.write = 1;
   4565 				if (i == 2)
   4566 					alu.last = 1;
   4567 
   4568 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4569 				if (r)
   4570 					return r;
   4571 			}
   4572 		} else {
   4573 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4574 
   4575 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   4576 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4577 			r600_bytecode_src_set_abs(&alu.src[0]);
   4578 
   4579 			alu.dst.sel = ctx->temp_reg;
   4580 			alu.dst.chan = 1;
   4581 			alu.dst.write = 1;
   4582 			alu.last = 1;
   4583 
   4584 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4585 			if (r)
   4586 				return r;
   4587 		}
   4588 
   4589 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4590 
   4591 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR);
   4592 		alu.src[0].sel = ctx->temp_reg;
   4593 		alu.src[0].chan = 1;
   4594 
   4595 		alu.dst.sel = ctx->temp_reg;
   4596 		alu.dst.chan = 1;
   4597 		alu.dst.write = 1;
   4598 		alu.last = 1;
   4599 
   4600 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4601 		if (r)
   4602 			return r;
   4603 
   4604 		if (ctx->bc->chip_class == CAYMAN) {
   4605 			for (i = 0; i < 3; i++) {
   4606 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4607 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   4608 				alu.src[0].sel = ctx->temp_reg;
   4609 				alu.src[0].chan = 1;
   4610 
   4611 				alu.dst.sel = ctx->temp_reg;
   4612 				alu.dst.chan = i;
   4613 				if (i == 1)
   4614 					alu.dst.write = 1;
   4615 				if (i == 2)
   4616 					alu.last = 1;
   4617 
   4618 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4619 				if (r)
   4620 					return r;
   4621 			}
   4622 		} else {
   4623 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4624 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE);
   4625 			alu.src[0].sel = ctx->temp_reg;
   4626 			alu.src[0].chan = 1;
   4627 
   4628 			alu.dst.sel = ctx->temp_reg;
   4629 			alu.dst.chan = 1;
   4630 			alu.dst.write = 1;
   4631 			alu.last = 1;
   4632 
   4633 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4634 			if (r)
   4635 				return r;
   4636 		}
   4637 
   4638 		if (ctx->bc->chip_class == CAYMAN) {
   4639 			for (i = 0; i < 3; i++) {
   4640 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4641 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   4642 				alu.src[0].sel = ctx->temp_reg;
   4643 				alu.src[0].chan = 1;
   4644 
   4645 				alu.dst.sel = ctx->temp_reg;
   4646 				alu.dst.chan = i;
   4647 				if (i == 1)
   4648 					alu.dst.write = 1;
   4649 				if (i == 2)
   4650 					alu.last = 1;
   4651 
   4652 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4653 				if (r)
   4654 					return r;
   4655 			}
   4656 		} else {
   4657 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4658 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE);
   4659 			alu.src[0].sel = ctx->temp_reg;
   4660 			alu.src[0].chan = 1;
   4661 
   4662 			alu.dst.sel = ctx->temp_reg;
   4663 			alu.dst.chan = 1;
   4664 			alu.dst.write = 1;
   4665 			alu.last = 1;
   4666 
   4667 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4668 			if (r)
   4669 				return r;
   4670 		}
   4671 
   4672 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4673 
   4674 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   4675 
   4676 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4677 		r600_bytecode_src_set_abs(&alu.src[0]);
   4678 
   4679 		alu.src[1].sel = ctx->temp_reg;
   4680 		alu.src[1].chan = 1;
   4681 
   4682 		alu.dst.sel = ctx->temp_reg;
   4683 		alu.dst.chan = 1;
   4684 		alu.dst.write = 1;
   4685 		alu.last = 1;
   4686 
   4687 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4688 		if (r)
   4689 			return r;
   4690 	}
   4691 
   4692 	/* result.z = log2(|src|);*/
   4693 	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
   4694 		if (ctx->bc->chip_class == CAYMAN) {
   4695 			for (i = 0; i < 3; i++) {
   4696 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4697 
   4698 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   4699 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4700 				r600_bytecode_src_set_abs(&alu.src[0]);
   4701 
   4702 				alu.dst.sel = ctx->temp_reg;
   4703 				if (i == 2)
   4704 					alu.dst.write = 1;
   4705 				alu.dst.chan = i;
   4706 				if (i == 2)
   4707 					alu.last = 1;
   4708 
   4709 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4710 				if (r)
   4711 					return r;
   4712 			}
   4713 		} else {
   4714 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4715 
   4716 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE);
   4717 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4718 			r600_bytecode_src_set_abs(&alu.src[0]);
   4719 
   4720 			alu.dst.sel = ctx->temp_reg;
   4721 			alu.dst.write = 1;
   4722 			alu.dst.chan = 2;
   4723 			alu.last = 1;
   4724 
   4725 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4726 			if (r)
   4727 				return r;
   4728 		}
   4729 	}
   4730 
   4731 	/* result.w = 1.0; */
   4732 	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
   4733 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4734 
   4735 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
   4736 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   4737 		alu.src[0].chan = 0;
   4738 
   4739 		alu.dst.sel = ctx->temp_reg;
   4740 		alu.dst.chan = 3;
   4741 		alu.dst.write = 1;
   4742 		alu.last = 1;
   4743 
   4744 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4745 		if (r)
   4746 			return r;
   4747 	}
   4748 
   4749 	return tgsi_helper_copy(ctx, inst);
   4750 }
   4751 
   4752 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
   4753 {
   4754 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4755 	struct r600_bytecode_alu alu;
   4756 	int r;
   4757 
   4758 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4759 
   4760 	switch (inst->Instruction.Opcode) {
   4761 	case TGSI_OPCODE_ARL:
   4762 		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
   4763 		break;
   4764 	case TGSI_OPCODE_ARR:
   4765 		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
   4766 		break;
   4767 	case TGSI_OPCODE_UARL:
   4768 		alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
   4769 		break;
   4770 	default:
   4771 		assert(0);
   4772 		return -1;
   4773 	}
   4774 
   4775 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4776 	alu.last = 1;
   4777 	alu.dst.sel = ctx->bc->ar_reg;
   4778 	alu.dst.write = 1;
   4779 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4780 	if (r)
   4781 		return r;
   4782 
   4783 	ctx->bc->ar_loaded = 0;
   4784 	return 0;
   4785 }
   4786 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
   4787 {
   4788 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4789 	struct r600_bytecode_alu alu;
   4790 	int r;
   4791 
   4792 	switch (inst->Instruction.Opcode) {
   4793 	case TGSI_OPCODE_ARL:
   4794 		memset(&alu, 0, sizeof(alu));
   4795 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR;
   4796 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4797 		alu.dst.sel = ctx->bc->ar_reg;
   4798 		alu.dst.write = 1;
   4799 		alu.last = 1;
   4800 
   4801 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   4802 			return r;
   4803 
   4804 		memset(&alu, 0, sizeof(alu));
   4805 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
   4806 		alu.src[0].sel = ctx->bc->ar_reg;
   4807 		alu.dst.sel = ctx->bc->ar_reg;
   4808 		alu.dst.write = 1;
   4809 		alu.last = 1;
   4810 
   4811 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   4812 			return r;
   4813 		break;
   4814 	case TGSI_OPCODE_ARR:
   4815 		memset(&alu, 0, sizeof(alu));
   4816 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT;
   4817 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4818 		alu.dst.sel = ctx->bc->ar_reg;
   4819 		alu.dst.write = 1;
   4820 		alu.last = 1;
   4821 
   4822 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   4823 			return r;
   4824 		break;
   4825 	case TGSI_OPCODE_UARL:
   4826 		memset(&alu, 0, sizeof(alu));
   4827 		alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV;
   4828 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4829 		alu.dst.sel = ctx->bc->ar_reg;
   4830 		alu.dst.write = 1;
   4831 		alu.last = 1;
   4832 
   4833 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   4834 			return r;
   4835 		break;
   4836 	default:
   4837 		assert(0);
   4838 		return -1;
   4839 	}
   4840 
   4841 	ctx->bc->ar_loaded = 0;
   4842 	return 0;
   4843 }
   4844 
   4845 static int tgsi_opdst(struct r600_shader_ctx *ctx)
   4846 {
   4847 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4848 	struct r600_bytecode_alu alu;
   4849 	int i, r = 0;
   4850 
   4851 	for (i = 0; i < 4; i++) {
   4852 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4853 
   4854 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
   4855 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4856 
   4857 		if (i == 0 || i == 3) {
   4858 			alu.src[0].sel = V_SQ_ALU_SRC_1;
   4859 		} else {
   4860 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   4861 		}
   4862 
   4863 		if (i == 0 || i == 2) {
   4864 			alu.src[1].sel = V_SQ_ALU_SRC_1;
   4865 		} else {
   4866 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   4867 		}
   4868 		if (i == 3)
   4869 			alu.last = 1;
   4870 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4871 		if (r)
   4872 			return r;
   4873 	}
   4874 	return 0;
   4875 }
   4876 
   4877 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode)
   4878 {
   4879 	struct r600_bytecode_alu alu;
   4880 	int r;
   4881 
   4882 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4883 	alu.inst = opcode;
   4884 	alu.execute_mask = 1;
   4885 	alu.update_pred = 1;
   4886 
   4887 	alu.dst.sel = ctx->temp_reg;
   4888 	alu.dst.write = 1;
   4889 	alu.dst.chan = 0;
   4890 
   4891 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4892 	alu.src[1].sel = V_SQ_ALU_SRC_0;
   4893 	alu.src[1].chan = 0;
   4894 
   4895 	alu.last = 1;
   4896 
   4897 	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE));
   4898 	if (r)
   4899 		return r;
   4900 	return 0;
   4901 }
   4902 
   4903 static int pops(struct r600_shader_ctx *ctx, int pops)
   4904 {
   4905 	unsigned force_pop = ctx->bc->force_add_cf;
   4906 
   4907 	if (!force_pop) {
   4908 		int alu_pop = 3;
   4909 		if (ctx->bc->cf_last) {
   4910 			if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU))
   4911 				alu_pop = 0;
   4912 			else if (ctx->bc->cf_last->inst == CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER))
   4913 				alu_pop = 1;
   4914 		}
   4915 		alu_pop += pops;
   4916 		if (alu_pop == 1) {
   4917 			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER);
   4918 			ctx->bc->force_add_cf = 1;
   4919 		} else if (alu_pop == 2) {
   4920 			ctx->bc->cf_last->inst = CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER);
   4921 			ctx->bc->force_add_cf = 1;
   4922 		} else {
   4923 			force_pop = 1;
   4924 		}
   4925 	}
   4926 
   4927 	if (force_pop) {
   4928 		r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_POP));
   4929 		ctx->bc->cf_last->pop_count = pops;
   4930 		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
   4931 	}
   4932 
   4933 	return 0;
   4934 }
   4935 
   4936 static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason)
   4937 {
   4938 	switch(reason) {
   4939 	case FC_PUSH_VPM:
   4940 		ctx->bc->callstack[ctx->bc->call_sp].current--;
   4941 		break;
   4942 	case FC_PUSH_WQM:
   4943 	case FC_LOOP:
   4944 		ctx->bc->callstack[ctx->bc->call_sp].current -= 4;
   4945 		break;
   4946 	case FC_REP:
   4947 		/* TOODO : for 16 vp asic should -= 2; */
   4948 		ctx->bc->callstack[ctx->bc->call_sp].current --;
   4949 		break;
   4950 	}
   4951 }
   4952 
   4953 static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only)
   4954 {
   4955 	if (check_max_only) {
   4956 		int diff;
   4957 		switch (reason) {
   4958 		case FC_PUSH_VPM:
   4959 			diff = 1;
   4960 			break;
   4961 		case FC_PUSH_WQM:
   4962 			diff = 4;
   4963 			break;
   4964 		default:
   4965 			assert(0);
   4966 			diff = 0;
   4967 		}
   4968 		if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) >
   4969 		    ctx->bc->callstack[ctx->bc->call_sp].max) {
   4970 			ctx->bc->callstack[ctx->bc->call_sp].max =
   4971 				ctx->bc->callstack[ctx->bc->call_sp].current + diff;
   4972 		}
   4973 		return;
   4974 	}
   4975 	switch (reason) {
   4976 	case FC_PUSH_VPM:
   4977 		ctx->bc->callstack[ctx->bc->call_sp].current++;
   4978 		break;
   4979 	case FC_PUSH_WQM:
   4980 	case FC_LOOP:
   4981 		ctx->bc->callstack[ctx->bc->call_sp].current += 4;
   4982 		break;
   4983 	case FC_REP:
   4984 		ctx->bc->callstack[ctx->bc->call_sp].current++;
   4985 		break;
   4986 	}
   4987 
   4988 	if ((ctx->bc->callstack[ctx->bc->call_sp].current) >
   4989 	    ctx->bc->callstack[ctx->bc->call_sp].max) {
   4990 		ctx->bc->callstack[ctx->bc->call_sp].max =
   4991 			ctx->bc->callstack[ctx->bc->call_sp].current;
   4992 	}
   4993 }
   4994 
   4995 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
   4996 {
   4997 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
   4998 
   4999 	sp->mid = (struct r600_bytecode_cf **)realloc((void *)sp->mid,
   5000 						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
   5001 	sp->mid[sp->num_mid] = ctx->bc->cf_last;
   5002 	sp->num_mid++;
   5003 }
   5004 
   5005 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
   5006 {
   5007 	ctx->bc->fc_sp++;
   5008 	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
   5009 	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
   5010 }
   5011 
   5012 static void fc_poplevel(struct r600_shader_ctx *ctx)
   5013 {
   5014 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
   5015 	if (sp->mid) {
   5016 		free(sp->mid);
   5017 		sp->mid = NULL;
   5018 	}
   5019 	sp->num_mid = 0;
   5020 	sp->start = NULL;
   5021 	sp->type = 0;
   5022 	ctx->bc->fc_sp--;
   5023 }
   5024 
   5025 #if 0
   5026 static int emit_return(struct r600_shader_ctx *ctx)
   5027 {
   5028 	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
   5029 	return 0;
   5030 }
   5031 
   5032 static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
   5033 {
   5034 
   5035 	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
   5036 	ctx->bc->cf_last->pop_count = pops;
   5037 	/* XXX work out offset */
   5038 	return 0;
   5039 }
   5040 
   5041 static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
   5042 {
   5043 	return 0;
   5044 }
   5045 
   5046 static void emit_testflag(struct r600_shader_ctx *ctx)
   5047 {
   5048 
   5049 }
   5050 
   5051 static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
   5052 {
   5053 	emit_testflag(ctx);
   5054 	emit_jump_to_offset(ctx, 1, 4);
   5055 	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
   5056 	pops(ctx, ifidx + 1);
   5057 	emit_return(ctx);
   5058 }
   5059 
   5060 static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
   5061 {
   5062 	emit_testflag(ctx);
   5063 
   5064 	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
   5065 	ctx->bc->cf_last->pop_count = 1;
   5066 
   5067 	fc_set_mid(ctx, fc_sp);
   5068 
   5069 	pops(ctx, 1);
   5070 }
   5071 #endif
   5072 
   5073 static int tgsi_if(struct r600_shader_ctx *ctx)
   5074 {
   5075 	emit_logic_pred(ctx, CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT));
   5076 
   5077 	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_JUMP));
   5078 
   5079 	fc_pushlevel(ctx, FC_IF);
   5080 
   5081 	callstack_check_depth(ctx, FC_PUSH_VPM, 0);
   5082 	return 0;
   5083 }
   5084 
   5085 static int tgsi_else(struct r600_shader_ctx *ctx)
   5086 {
   5087 	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_ELSE));
   5088 	ctx->bc->cf_last->pop_count = 1;
   5089 
   5090 	fc_set_mid(ctx, ctx->bc->fc_sp);
   5091 	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
   5092 	return 0;
   5093 }
   5094 
   5095 static int tgsi_endif(struct r600_shader_ctx *ctx)
   5096 {
   5097 	pops(ctx, 1);
   5098 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
   5099 		R600_ERR("if/endif unbalanced in shader\n");
   5100 		return -1;
   5101 	}
   5102 
   5103 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
   5104 		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
   5105 		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
   5106 	} else {
   5107 		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
   5108 	}
   5109 	fc_poplevel(ctx);
   5110 
   5111 	callstack_decrease_current(ctx, FC_PUSH_VPM);
   5112 	return 0;
   5113 }
   5114 
   5115 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
   5116 {
   5117 	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
   5118 	 * limited to 4096 iterations, like the other LOOP_* instructions. */
   5119 	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_DX10));
   5120 
   5121 	fc_pushlevel(ctx, FC_LOOP);
   5122 
   5123 	/* check stack depth */
   5124 	callstack_check_depth(ctx, FC_LOOP, 0);
   5125 	return 0;
   5126 }
   5127 
   5128 static int tgsi_endloop(struct r600_shader_ctx *ctx)
   5129 {
   5130 	int i;
   5131 
   5132 	r600_bytecode_add_cfinst(ctx->bc, CTX_INST(V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END));
   5133 
   5134 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
   5135 		R600_ERR("loop/endloop in shader code are not paired.\n");
   5136 		return -EINVAL;
   5137 	}
   5138 
   5139 	/* fixup loop pointers - from r600isa
   5140 	   LOOP END points to CF after LOOP START,
   5141 	   LOOP START point to CF after LOOP END
   5142 	   BRK/CONT point to LOOP END CF
   5143 	*/
   5144 	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
   5145 
   5146 	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
   5147 
   5148 	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
   5149 		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
   5150 	}
   5151 	/* XXX add LOOPRET support */
   5152 	fc_poplevel(ctx);
   5153 	callstack_decrease_current(ctx, FC_LOOP);
   5154 	return 0;
   5155 }
   5156 
   5157 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
   5158 {
   5159 	unsigned int fscp;
   5160 
   5161 	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
   5162 	{
   5163 		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
   5164 			break;
   5165 	}
   5166 
   5167 	if (fscp == 0) {
   5168 		R600_ERR("Break not inside loop/endloop pair\n");
   5169 		return -EINVAL;
   5170 	}
   5171 
   5172 	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode);
   5173 
   5174 	fc_set_mid(ctx, fscp);
   5175 
   5176 	callstack_check_depth(ctx, FC_PUSH_VPM, 1);
   5177 	return 0;
   5178 }
   5179 
   5180 static int tgsi_umad(struct r600_shader_ctx *ctx)
   5181 {
   5182 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   5183 	struct r600_bytecode_alu alu;
   5184 	int i, j, r;
   5185 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   5186 
   5187 	/* src0 * src1 */
   5188 	for (i = 0; i < lasti + 1; i++) {
   5189 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   5190 			continue;
   5191 
   5192 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5193 
   5194 		alu.dst.chan = i;
   5195 		alu.dst.sel = ctx->temp_reg;
   5196 		alu.dst.write = 1;
   5197 
   5198 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT);
   5199 		for (j = 0; j < 2; j++) {
   5200 		        r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
   5201 		}
   5202 
   5203 		alu.last = 1;
   5204 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   5205 		if (r)
   5206 			return r;
   5207 	}
   5208 
   5209 
   5210 	for (i = 0; i < lasti + 1; i++) {
   5211 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   5212 			continue;
   5213 
   5214 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5215 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   5216 
   5217 		alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT);
   5218 
   5219 		alu.src[0].sel = ctx->temp_reg;
   5220 		alu.src[0].chan = i;
   5221 
   5222 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   5223 		if (i == lasti) {
   5224 			alu.last = 1;
   5225 		}
   5226 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   5227 		if (r)
   5228 			return r;
   5229 	}
   5230 	return 0;
   5231 }
   5232 
   5233 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
   5234 	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
   5235 	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
   5236 	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
   5237 
   5238 	/* XXX:
   5239 	 * For state trackers other than OpenGL, we'll want to use
   5240 	 * _RECIP_IEEE instead.
   5241 	 */
   5242 	{TGSI_OPCODE_RCP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
   5243 
   5244 	{TGSI_OPCODE_RSQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_rsq},
   5245 	{TGSI_OPCODE_EXP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
   5246 	{TGSI_OPCODE_LOG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
   5247 	{TGSI_OPCODE_MUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
   5248 	{TGSI_OPCODE_ADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
   5249 	{TGSI_OPCODE_DP3,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5250 	{TGSI_OPCODE_DP4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5251 	{TGSI_OPCODE_DST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
   5252 	{TGSI_OPCODE_MIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
   5253 	{TGSI_OPCODE_MAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
   5254 	{TGSI_OPCODE_SLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},
   5255 	{TGSI_OPCODE_SGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
   5256 	{TGSI_OPCODE_MAD,	1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},
   5257 	{TGSI_OPCODE_SUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
   5258 	{TGSI_OPCODE_LRP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
   5259 	{TGSI_OPCODE_CND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5260 	/* gap */
   5261 	{20,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5262 	{TGSI_OPCODE_DP2A,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5263 	/* gap */
   5264 	{22,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5265 	{23,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5266 	{TGSI_OPCODE_FRC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
   5267 	{TGSI_OPCODE_CLAMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5268 	{TGSI_OPCODE_FLR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
   5269 	{TGSI_OPCODE_ROUND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
   5270 	{TGSI_OPCODE_EX2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
   5271 	{TGSI_OPCODE_LG2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
   5272 	{TGSI_OPCODE_POW,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
   5273 	{TGSI_OPCODE_XPD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
   5274 	/* gap */
   5275 	{32,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5276 	{TGSI_OPCODE_ABS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
   5277 	{TGSI_OPCODE_RCC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5278 	{TGSI_OPCODE_DPH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5279 	{TGSI_OPCODE_COS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
   5280 	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
   5281 	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
   5282 	{TGSI_OPCODE_KILP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
   5283 	{TGSI_OPCODE_PK2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5284 	{TGSI_OPCODE_PK2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5285 	{TGSI_OPCODE_PK4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5286 	{TGSI_OPCODE_PK4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5287 	{TGSI_OPCODE_RFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5288 	{TGSI_OPCODE_SEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
   5289 	{TGSI_OPCODE_SFL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5290 	{TGSI_OPCODE_SGT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
   5291 	{TGSI_OPCODE_SIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
   5292 	{TGSI_OPCODE_SLE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
   5293 	{TGSI_OPCODE_SNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
   5294 	{TGSI_OPCODE_STR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5295 	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
   5296 	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
   5297 	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
   5298 	{TGSI_OPCODE_UP2H,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5299 	{TGSI_OPCODE_UP2US,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5300 	{TGSI_OPCODE_UP4B,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5301 	{TGSI_OPCODE_UP4UB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5302 	{TGSI_OPCODE_X2D,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5303 	{TGSI_OPCODE_ARA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5304 	{TGSI_OPCODE_ARR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
   5305 	{TGSI_OPCODE_BRA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5306 	{TGSI_OPCODE_CAL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5307 	{TGSI_OPCODE_RET,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5308 	{TGSI_OPCODE_SSG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
   5309 	{TGSI_OPCODE_CMP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
   5310 	{TGSI_OPCODE_SCS,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
   5311 	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
   5312 	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5313 	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5314 	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5315 	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
   5316 	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
   5317 	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
   5318 	/* gap */
   5319 	{75,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5320 	{76,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5321 	{TGSI_OPCODE_ELSE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
   5322 	{TGSI_OPCODE_ENDIF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
   5323 	/* gap */
   5324 	{79,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5325 	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5326 	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5327 	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5328 	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
   5329 	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
   5330 	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
   5331 	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
   5332 	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2_trans},
   5333 	/* gap */
   5334 	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5335 	{TGSI_OPCODE_AND,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
   5336 	{TGSI_OPCODE_OR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
   5337 	{TGSI_OPCODE_MOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
   5338 	{TGSI_OPCODE_XOR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
   5339 	{TGSI_OPCODE_SAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5340 	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
   5341 	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
   5342 	{TGSI_OPCODE_CONT,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
   5343 	{TGSI_OPCODE_EMIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5344 	{TGSI_OPCODE_ENDPRIM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5345 	{TGSI_OPCODE_BGNLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
   5346 	{TGSI_OPCODE_BGNSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5347 	{TGSI_OPCODE_ENDLOOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
   5348 	{TGSI_OPCODE_ENDSUB,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5349 	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
   5350 	/* gap */
   5351 	{104,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5352 	{105,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5353 	{106,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5354 	{TGSI_OPCODE_NOP,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5355 	/* gap */
   5356 	{108,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5357 	{109,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5358 	{110,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5359 	{111,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5360 	{TGSI_OPCODE_NRM4,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5361 	{TGSI_OPCODE_CALLNZ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5362 	{TGSI_OPCODE_IFC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5363 	{TGSI_OPCODE_BREAKC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5364 	{TGSI_OPCODE_KIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
   5365 	{TGSI_OPCODE_END,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
   5366 	/* gap */
   5367 	{118,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5368 	{TGSI_OPCODE_F2I,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2_trans},
   5369 	{TGSI_OPCODE_IDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
   5370 	{TGSI_OPCODE_IMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
   5371 	{TGSI_OPCODE_IMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
   5372 	{TGSI_OPCODE_INEG,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
   5373 	{TGSI_OPCODE_ISGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
   5374 	{TGSI_OPCODE_ISHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2_trans},
   5375 	{TGSI_OPCODE_ISLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
   5376 	{TGSI_OPCODE_F2U,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2_trans},
   5377 	{TGSI_OPCODE_U2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
   5378 	{TGSI_OPCODE_UADD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
   5379 	{TGSI_OPCODE_UDIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
   5380 	{TGSI_OPCODE_UMAD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
   5381 	{TGSI_OPCODE_UMAX,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
   5382 	{TGSI_OPCODE_UMIN,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
   5383 	{TGSI_OPCODE_UMOD,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
   5384 	{TGSI_OPCODE_UMUL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
   5385 	{TGSI_OPCODE_USEQ,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
   5386 	{TGSI_OPCODE_USGE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
   5387 	{TGSI_OPCODE_USHR,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2_trans},
   5388 	{TGSI_OPCODE_USLT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
   5389 	{TGSI_OPCODE_USNE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2_swap},
   5390 	{TGSI_OPCODE_SWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5391 	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5392 	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5393 	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5394 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
   5395 	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
   5396 	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
   5397 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
   5398 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
   5399 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
   5400 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
   5401 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
   5402 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
   5403 	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
   5404 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
   5405 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
   5406 	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
   5407 	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
   5408 	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
   5409 	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
   5410 	{TGSI_OPCODE_LAST,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5411 };
   5412 
   5413 static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
        	/*
        	 * TGSI translation table built from the EG_-prefixed ALU/CF opcode
        	 * constants (the "eg" prefix presumably means Evergreen -- confirm
        	 * against the code that selects a table per chip class).
        	 *
        	 * Each row holds four values, in this order:
        	 *   1. the TGSI opcode, or a bare number for an unassigned slot;
        	 *   2. a flag that is 1 only on the MAD row, the single row that
        	 *      uses an OP3 opcode constant -- presumably "three-operand
        	 *      ALU instruction"; verify against the struct definition;
        	 *   3. the hardware ALU, CF or texture opcode constant paired with
        	 *      the TGSI opcode (NOP / 0 where the handler does not take a
        	 *      single opcode from the table);
        	 *   4. the handler function for the opcode.
        	 *
        	 * The numeric placeholder rows under the "gap" markers increase
        	 * monotonically through the table, so the rows appear to be kept in
        	 * TGSI opcode order and the table is presumably indexed directly by
        	 * opcode value. NOTE(review): confirm at the lookup site before
        	 * reordering, inserting or deleting rows.
        	 *
        	 * Opcodes without an implementation are routed to tgsi_unsupported.
        	 */
   5414 	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
   5415 	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
   5416 	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
   5417 	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
   5418 	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_rsq},
   5419 	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
   5420 	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
   5421 	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
   5422 	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
   5423 	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5424 	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5425 	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
   5426 	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
   5427 	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
   5428 	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},  /* a < b via SETGT; presumably the _swap handler exchanges the operands -- confirm */
   5429 	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
   5430 	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},  /* only row with the flag set and an OP3 opcode */
   5431 	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},  /* shares the ADD opcode; NOTE(review): handler presumably negates the second source -- confirm in tgsi_op2 */
   5432 	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
   5433 	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5434 	/* gap */
   5435 	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5436 	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5437 	/* gap */
   5438 	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5439 	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5440 	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
   5441 	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5442 	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
   5443 	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
   5444 	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate},
   5445 	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate},
   5446 	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow},
   5447 	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
   5448 	/* gap */
   5449 	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5450 	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
   5451 	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5452 	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5453 	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig},
   5454 	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
   5455 	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
   5456 	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
   5457 	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5458 	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5459 	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5460 	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5461 	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5462 	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
   5463 	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5464 	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
   5465 	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig},
   5466 	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
   5467 	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
   5468 	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5469 	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
   5470 	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
   5471 	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
   5472 	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5473 	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5474 	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5475 	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5476 	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5477 	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5478 	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
   5479 	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5480 	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5481 	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5482 	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
   5483 	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
   5484 	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
   5485 	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
   5486 	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5487 	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5488 	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5489 	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
   5490 	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
   5491 	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
   5492 	/* gap */
   5493 	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5494 	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5495 	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
   5496 	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
   5497 	/* gap */
   5498 	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5499 	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5500 	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5501 	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5502 	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
   5503 	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
   5504 	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
   5505 	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
   5506 	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
   5507 	/* gap */
   5508 	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5509 	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
   5510 	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
   5511 	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
   5512 	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
   5513 	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5514 	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
   5515 	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
   5516 	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
   5517 	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5518 	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5519 	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
   5520 	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5521 	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
   5522 	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5523 	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
   5524 	/* gap */
   5525 	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5526 	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5527 	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5528 	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5529 	/* gap */
   5530 	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5531 	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5532 	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5533 	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5534 	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5535 	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5536 	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5537 	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5538 	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
   5539 	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
   5540 	/* gap */
   5541 	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5542 	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_f2i},
   5543 	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
   5544 	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
   5545 	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
   5546 	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
   5547 	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
   5548 	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
   5549 	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
   5550 	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_f2i},
   5551 	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2_trans},
   5552 	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
   5553 	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
   5554 	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
   5555 	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
   5556 	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
   5557 	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
   5558 	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT, tgsi_op2_trans},
   5559 	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
   5560 	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
   5561 	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
   5562 	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
   5563 	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
   5564 	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5565 	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5566 	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5567 	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5568 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
   5569 	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
   5570 	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
   5571 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
   5572 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
   5573 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
   5574 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
   5575 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
   5576 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
   5577 	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
   5578 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
   5579 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
   5580 	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
   5581 	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
   5582 	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
   5583 	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
   5584 	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5585 };
   5586 
   5587 static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
        	/*
        	 * TGSI translation table that routes several opcodes to cayman_*
        	 * handlers (the "cm" prefix presumably means Cayman -- confirm
        	 * against the code that selects a table per chip class). It reuses
        	 * the EG_-prefixed hardware opcode constants.
        	 *
        	 * Each row holds four values, in this order:
        	 *   1. the TGSI opcode, or a bare number for an unassigned slot;
        	 *   2. a flag that is 1 only on the MAD row, the single row that
        	 *      uses an OP3 opcode constant -- presumably "three-operand
        	 *      ALU instruction"; verify against the struct definition;
        	 *   3. the hardware ALU, CF or texture opcode constant paired with
        	 *      the TGSI opcode (NOP / 0 where the handler does not take a
        	 *      single opcode from the table);
        	 *   4. the handler function for the opcode.
        	 *
        	 * Rows that differ from eg_shader_tgsi_instruction:
        	 *   - RCP, RSQ, EX2, LG2 use cayman_emit_float_instr;
        	 *   - POW uses cayman_pow; COS and SIN use cayman_trig;
        	 *   - I2F, F2I, F2U and U2F use plain tgsi_op2;
        	 *   - UMUL uses MULLO_INT with cayman_mul_int_instr.
        	 *
        	 * The numeric placeholder rows under the "gap" markers increase
        	 * monotonically through the table, so the rows appear to be kept in
        	 * TGSI opcode order and the table is presumably indexed directly by
        	 * opcode value. NOTE(review): confirm at the lookup site before
        	 * reordering, inserting or deleting rows.
        	 *
        	 * Opcodes without an implementation are routed to tgsi_unsupported.
        	 */
   5588 	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
   5589 	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
   5590 	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
   5591 	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, cayman_emit_float_instr},
   5592 	{TGSI_OPCODE_RSQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, cayman_emit_float_instr},
   5593 	{TGSI_OPCODE_EXP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp},
   5594 	{TGSI_OPCODE_LOG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_log},
   5595 	{TGSI_OPCODE_MUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2},
   5596 	{TGSI_OPCODE_ADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},
   5597 	{TGSI_OPCODE_DP3,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5598 	{TGSI_OPCODE_DP4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5599 	{TGSI_OPCODE_DST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst},
   5600 	{TGSI_OPCODE_MIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2},
   5601 	{TGSI_OPCODE_MAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2},
   5602 	{TGSI_OPCODE_SLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap},  /* a < b via SETGT; presumably the _swap handler exchanges the operands -- confirm */
   5603 	{TGSI_OPCODE_SGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2},
   5604 	{TGSI_OPCODE_MAD,	1, EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3},  /* only row with the flag set and an OP3 opcode */
   5605 	{TGSI_OPCODE_SUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2},  /* shares the ADD opcode; NOTE(review): handler presumably negates the second source -- confirm in tgsi_op2 */
   5606 	{TGSI_OPCODE_LRP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp},
   5607 	{TGSI_OPCODE_CND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5608 	/* gap */
   5609 	{20,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5610 	{TGSI_OPCODE_DP2A,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5611 	/* gap */
   5612 	{22,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5613 	{23,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5614 	{TGSI_OPCODE_FRC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2},
   5615 	{TGSI_OPCODE_CLAMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5616 	{TGSI_OPCODE_FLR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2},
   5617 	{TGSI_OPCODE_ROUND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE, tgsi_op2},
   5618 	{TGSI_OPCODE_EX2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, cayman_emit_float_instr},
   5619 	{TGSI_OPCODE_LG2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, cayman_emit_float_instr},
   5620 	{TGSI_OPCODE_POW,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, cayman_pow},
   5621 	{TGSI_OPCODE_XPD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd},
   5622 	/* gap */
   5623 	{32,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5624 	{TGSI_OPCODE_ABS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
   5625 	{TGSI_OPCODE_RCC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5626 	{TGSI_OPCODE_DPH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5627 	{TGSI_OPCODE_COS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, cayman_trig},
   5628 	{TGSI_OPCODE_DDX,	0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex},
   5629 	{TGSI_OPCODE_DDY,	0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex},
   5630 	{TGSI_OPCODE_KILP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* predicated kill */
   5631 	{TGSI_OPCODE_PK2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5632 	{TGSI_OPCODE_PK2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5633 	{TGSI_OPCODE_PK4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5634 	{TGSI_OPCODE_PK4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5635 	{TGSI_OPCODE_RFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5636 	{TGSI_OPCODE_SEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2},
   5637 	{TGSI_OPCODE_SFL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5638 	{TGSI_OPCODE_SGT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2},
   5639 	{TGSI_OPCODE_SIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, cayman_trig},
   5640 	{TGSI_OPCODE_SLE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap},
   5641 	{TGSI_OPCODE_SNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2},
   5642 	{TGSI_OPCODE_STR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5643 	{TGSI_OPCODE_TEX,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
   5644 	{TGSI_OPCODE_TXD,	0, SQ_TEX_INST_SAMPLE_G, tgsi_tex},
   5645 	{TGSI_OPCODE_TXP,	0, SQ_TEX_INST_SAMPLE, tgsi_tex},
   5646 	{TGSI_OPCODE_UP2H,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5647 	{TGSI_OPCODE_UP2US,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5648 	{TGSI_OPCODE_UP4B,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5649 	{TGSI_OPCODE_UP4UB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5650 	{TGSI_OPCODE_X2D,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5651 	{TGSI_OPCODE_ARA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5652 	{TGSI_OPCODE_ARR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
   5653 	{TGSI_OPCODE_BRA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5654 	{TGSI_OPCODE_CAL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5655 	{TGSI_OPCODE_RET,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5656 	{TGSI_OPCODE_SSG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg},
   5657 	{TGSI_OPCODE_CMP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp},
   5658 	{TGSI_OPCODE_SCS,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs},
   5659 	{TGSI_OPCODE_TXB,	0, SQ_TEX_INST_SAMPLE_LB, tgsi_tex},
   5660 	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5661 	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5662 	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
   5663 	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
   5664 	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
   5665 	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
   5666 	/* gap */
   5667 	{75,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5668 	{76,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5669 	{TGSI_OPCODE_ELSE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else},
   5670 	{TGSI_OPCODE_ENDIF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif},
   5671 	/* gap */
   5672 	{79,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5673 	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5674 	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5675 	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5676 	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
   5677 	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
   5678 	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
   5679 	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
   5680 	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT, tgsi_op2},
   5681 	/* gap */
   5682 	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5683 	{TGSI_OPCODE_AND,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_AND_INT, tgsi_op2},
   5684 	{TGSI_OPCODE_OR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_OR_INT, tgsi_op2},
   5685 	{TGSI_OPCODE_MOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_imod},
   5686 	{TGSI_OPCODE_XOR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_XOR_INT, tgsi_op2},
   5687 	{TGSI_OPCODE_SAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5688 	{TGSI_OPCODE_TXF,	0, SQ_TEX_INST_LD, tgsi_tex},
   5689 	{TGSI_OPCODE_TXQ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
   5690 	{TGSI_OPCODE_CONT,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont},
   5691 	{TGSI_OPCODE_EMIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5692 	{TGSI_OPCODE_ENDPRIM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5693 	{TGSI_OPCODE_BGNLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop},
   5694 	{TGSI_OPCODE_BGNSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5695 	{TGSI_OPCODE_ENDLOOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop},
   5696 	{TGSI_OPCODE_ENDSUB,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5697 	{TGSI_OPCODE_TXQ_LZ,	0, SQ_TEX_INST_GET_TEXTURE_RESINFO, tgsi_tex},
   5698 	/* gap */
   5699 	{104,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5700 	{105,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5701 	{106,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5702 	{TGSI_OPCODE_NOP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5703 	/* gap */
   5704 	{108,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5705 	{109,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5706 	{110,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5707 	{111,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5708 	{TGSI_OPCODE_NRM4,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5709 	{TGSI_OPCODE_CALLNZ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5710 	{TGSI_OPCODE_IFC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5711 	{TGSI_OPCODE_BREAKC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5712 	{TGSI_OPCODE_KIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill},  /* conditional kill */
   5713 	{TGSI_OPCODE_END,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_end},  /* aka HALT */
   5714 	/* gap */
   5715 	{118,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5716 	{TGSI_OPCODE_F2I,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT, tgsi_op2},
   5717 	{TGSI_OPCODE_IDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_idiv},
   5718 	{TGSI_OPCODE_IMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_INT, tgsi_op2},
   5719 	{TGSI_OPCODE_IMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_INT, tgsi_op2},
   5720 	{TGSI_OPCODE_INEG,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SUB_INT, tgsi_ineg},
   5721 	{TGSI_OPCODE_ISGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT, tgsi_op2},
   5722 	{TGSI_OPCODE_ISHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT, tgsi_op2},
   5723 	{TGSI_OPCODE_ISLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_INT, tgsi_op2_swap},
   5724 	{TGSI_OPCODE_F2U,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT, tgsi_op2},
   5725 	{TGSI_OPCODE_U2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT, tgsi_op2},
   5726 	{TGSI_OPCODE_UADD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT, tgsi_op2},
   5727 	{TGSI_OPCODE_UDIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_udiv},
   5728 	{TGSI_OPCODE_UMAD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umad},
   5729 	{TGSI_OPCODE_UMAX,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX_UINT, tgsi_op2},
   5730 	{TGSI_OPCODE_UMIN,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN_UINT, tgsi_op2},
   5731 	{TGSI_OPCODE_UMOD,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_umod},
   5732 	{TGSI_OPCODE_UMUL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT, cayman_mul_int_instr},  /* MULLO_INT here vs MULLO_UINT in the eg_ table */
   5733 	{TGSI_OPCODE_USEQ,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE_INT, tgsi_op2},
   5734 	{TGSI_OPCODE_USGE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT, tgsi_op2},
   5735 	{TGSI_OPCODE_USHR,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT, tgsi_op2},
   5736 	{TGSI_OPCODE_USLT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT_UINT, tgsi_op2_swap},
   5737 	{TGSI_OPCODE_USNE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE_INT, tgsi_op2},
   5738 	{TGSI_OPCODE_SWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5739 	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5740 	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5741 	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5742 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
   5743 	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
   5744 	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
   5745 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
   5746 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
   5747 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
   5748 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
   5749 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
   5750 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
   5751 	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
   5752 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
   5753 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
   5754 	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
   5755 	{TGSI_OPCODE_UCMP,      0, 0, tgsi_unsupported},
   5756 	{TGSI_OPCODE_IABS,      0, 0, tgsi_iabs},
   5757 	{TGSI_OPCODE_ISSG,      0, 0, tgsi_issg},
   5758 	{TGSI_OPCODE_LAST,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
   5759 };
   5760