Home | History | Annotate | Download | only in r600
      1 /*
      2  * Copyright 2010 Jerome Glisse <glisse (at) freedesktop.org>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     22  */
     23 #include "r600_sq.h"
     24 #include "r600_formats.h"
     25 #include "r600_opcodes.h"
     26 #include "r600_shader.h"
     27 #include "r600d.h"
     28 
     29 #include "sb/sb_public.h"
     30 
     31 #include "pipe/p_shader_tokens.h"
     32 #include "tgsi/tgsi_info.h"
     33 #include "tgsi/tgsi_parse.h"
     34 #include "tgsi/tgsi_scan.h"
     35 #include "tgsi/tgsi_dump.h"
     36 #include "util/u_bitcast.h"
     37 #include "util/u_memory.h"
     38 #include "util/u_math.h"
     39 #include <stdio.h>
     40 #include <errno.h>
     41 
     42 /* CAYMAN notes
     43 Why CAYMAN got loops for lots of instructions is explained here.
     44 
     45 -These 8xx t-slot only ops are implemented in all vector slots.
     46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
     47 These 8xx t-slot only opcodes become vector ops, with all four
     48 slots expecting the arguments on sources a and b. Result is
     49 broadcast to all channels.
     50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
     51 These 8xx t-slot only opcodes become vector ops in the z, y, and
     52 x slots.
     53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
     54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
     55 SQRT_IEEE/_64
     56 SIN/COS
     57 The w slot may have an independent co-issued operation, or if the
     58 result is required to be in the w slot, the opcode above may be
     59 issued in the w slot as well.
     60 The compiler must issue the source argument to slots z, y, and x
     61 */
     62 
     63 /* Contents of r0 on entry to various shaders
     64 
     65  VS - .x = VertexID
     66       .y = RelVertexID (??)
     67       .w = InstanceID
     68 
     69  GS - r0.xyw, r1.xyz = per-vertex offsets
     70       r0.z = PrimitiveID
     71 
     72  TCS - .x = PatchID
     73        .y = RelPatchID (??)
     74        .z = InvocationID
     75        .w = tess factor base.
     76 
     77  TES - .x = TessCoord.x
     78      - .y = TessCoord.y
     79      - .z = RelPatchID (??)
     80      - .w = PrimitiveID
     81 
     82  PS - face_gpr.z = SampleMask
     83       face_gpr.w = SampleID
     84 */
     85 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
     86 static int r600_shader_from_tgsi(struct r600_context *rctx,
     87 				 struct r600_pipe_shader *pipeshader,
     88 				 union r600_shader_key key);
     89 
     90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
     91                            int size, unsigned comp_mask) {
     92 
     93 	if (!size)
     94 		return;
     95 
     96 	if (ps->num_arrays == ps->max_arrays) {
     97 		ps->max_arrays += 64;
     98 		ps->arrays = realloc(ps->arrays, ps->max_arrays *
     99 		                     sizeof(struct r600_shader_array));
    100 	}
    101 
    102 	int n = ps->num_arrays;
    103 	++ps->num_arrays;
    104 
    105 	ps->arrays[n].comp_mask = comp_mask;
    106 	ps->arrays[n].gpr_start = start_gpr;
    107 	ps->arrays[n].gpr_count = size;
    108 }
    109 
    110 static void r600_dump_streamout(struct pipe_stream_output_info *so)
    111 {
    112 	unsigned i;
    113 
    114 	fprintf(stderr, "STREAMOUT\n");
    115 	for (i = 0; i < so->num_outputs; i++) {
    116 		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
    117 				so->output[i].start_component;
    118 		fprintf(stderr, "  %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
    119 			i,
    120 			so->output[i].stream,
    121 			so->output[i].output_buffer,
    122 			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
    123 			so->output[i].register_index,
    124 			mask & 1 ? "x" : "",
    125 		        mask & 2 ? "y" : "",
    126 		        mask & 4 ? "z" : "",
    127 		        mask & 8 ? "w" : "",
    128 			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
    129 	}
    130 }
    131 
    132 static int store_shader(struct pipe_context *ctx,
    133 			struct r600_pipe_shader *shader)
    134 {
    135 	struct r600_context *rctx = (struct r600_context *)ctx;
    136 	uint32_t *ptr, i;
    137 
    138 	if (shader->bo == NULL) {
    139 		shader->bo = (struct r600_resource*)
    140 			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
    141 		if (shader->bo == NULL) {
    142 			return -ENOMEM;
    143 		}
    144 		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
    145 		if (R600_BIG_ENDIAN) {
    146 			for (i = 0; i < shader->shader.bc.ndw; ++i) {
    147 				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
    148 			}
    149 		} else {
    150 			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
    151 		}
    152 		rctx->b.ws->buffer_unmap(shader->bo->buf);
    153 	}
    154 
    155 	return 0;
    156 }
    157 
    158 int r600_pipe_shader_create(struct pipe_context *ctx,
    159 			    struct r600_pipe_shader *shader,
    160 			    union r600_shader_key key)
    161 {
    162 	struct r600_context *rctx = (struct r600_context *)ctx;
    163 	struct r600_pipe_shader_selector *sel = shader->selector;
    164 	int r;
    165 	bool dump = r600_can_dump_shader(&rctx->screen->b,
    166 					 tgsi_get_processor_type(sel->tokens));
    167 	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
    168 	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
    169 	unsigned export_shader;
    170 
    171 	shader->shader.bc.isa = rctx->isa;
    172 
    173 	if (dump) {
    174 		fprintf(stderr, "--------------------------------------------------------------\n");
    175 		tgsi_dump(sel->tokens, 0);
    176 
    177 		if (sel->so.num_outputs) {
    178 			r600_dump_streamout(&sel->so);
    179 		}
    180 	}
    181 	r = r600_shader_from_tgsi(rctx, shader, key);
    182 	if (r) {
    183 		R600_ERR("translation from TGSI failed !\n");
    184 		goto error;
    185 	}
    186 	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
    187 		/* only disable for vertex shaders in tess paths */
    188 		if (key.vs.as_ls)
    189 			use_sb = 0;
    190 	}
    191 	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
    192 	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
    193 
    194 	/* disable SB for shaders using doubles */
    195 	use_sb &= !shader->shader.uses_doubles;
    196 
    197 	/* Check if the bytecode has already been built. */
    198 	if (!shader->shader.bc.bytecode) {
    199 		r = r600_bytecode_build(&shader->shader.bc);
    200 		if (r) {
    201 			R600_ERR("building bytecode failed !\n");
    202 			goto error;
    203 		}
    204 	}
    205 
    206 	if (dump && !sb_disasm) {
    207 		fprintf(stderr, "--------------------------------------------------------------\n");
    208 		r600_bytecode_disasm(&shader->shader.bc);
    209 		fprintf(stderr, "______________________________________________________________\n");
    210 	} else if ((dump && sb_disasm) || use_sb) {
    211 		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
    212 		                             dump, use_sb);
    213 		if (r) {
    214 			R600_ERR("r600_sb_bytecode_process failed !\n");
    215 			goto error;
    216 		}
    217 	}
    218 
    219 	if (shader->gs_copy_shader) {
    220 		if (dump) {
    221 			// dump copy shader
    222 			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
    223 						     &shader->gs_copy_shader->shader, dump, 0);
    224 			if (r)
    225 				goto error;
    226 		}
    227 
    228 		if ((r = store_shader(ctx, shader->gs_copy_shader)))
    229 			goto error;
    230 	}
    231 
    232 	/* Store the shader in a buffer. */
    233 	if ((r = store_shader(ctx, shader)))
    234 		goto error;
    235 
    236 	/* Build state. */
    237 	switch (shader->shader.processor_type) {
    238 	case PIPE_SHADER_TESS_CTRL:
    239 		evergreen_update_hs_state(ctx, shader);
    240 		break;
    241 	case PIPE_SHADER_TESS_EVAL:
    242 		if (key.tes.as_es)
    243 			evergreen_update_es_state(ctx, shader);
    244 		else
    245 			evergreen_update_vs_state(ctx, shader);
    246 		break;
    247 	case PIPE_SHADER_GEOMETRY:
    248 		if (rctx->b.chip_class >= EVERGREEN) {
    249 			evergreen_update_gs_state(ctx, shader);
    250 			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
    251 		} else {
    252 			r600_update_gs_state(ctx, shader);
    253 			r600_update_vs_state(ctx, shader->gs_copy_shader);
    254 		}
    255 		break;
    256 	case PIPE_SHADER_VERTEX:
    257 		export_shader = key.vs.as_es;
    258 		if (rctx->b.chip_class >= EVERGREEN) {
    259 			if (key.vs.as_ls)
    260 				evergreen_update_ls_state(ctx, shader);
    261 			else if (key.vs.as_es)
    262 				evergreen_update_es_state(ctx, shader);
    263 			else
    264 				evergreen_update_vs_state(ctx, shader);
    265 		} else {
    266 			if (export_shader)
    267 				r600_update_es_state(ctx, shader);
    268 			else
    269 				r600_update_vs_state(ctx, shader);
    270 		}
    271 		break;
    272 	case PIPE_SHADER_FRAGMENT:
    273 		if (rctx->b.chip_class >= EVERGREEN) {
    274 			evergreen_update_ps_state(ctx, shader);
    275 		} else {
    276 			r600_update_ps_state(ctx, shader);
    277 		}
    278 		break;
    279 	default:
    280 		r = -EINVAL;
    281 		goto error;
    282 	}
    283 	return 0;
    284 
    285 error:
    286 	r600_pipe_shader_destroy(ctx, shader);
    287 	return r;
    288 }
    289 
    290 void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
    291 {
    292 	r600_resource_reference(&shader->bo, NULL);
    293 	r600_bytecode_clear(&shader->shader.bc);
    294 	r600_release_command_buffer(&shader->command_buffer);
    295 }
    296 
    297 /*
    298  * tgsi -> r600 shader
    299  */
    300 struct r600_shader_tgsi_instruction;
    301 
    302 struct r600_shader_src {
    303 	unsigned				sel;
    304 	unsigned				swizzle[4];
    305 	unsigned				neg;
    306 	unsigned				abs;
    307 	unsigned				rel;
    308 	unsigned				kc_bank;
    309 	boolean					kc_rel; /* true if cache bank is indexed */
    310 	uint32_t				value[4];
    311 };
    312 
    313 struct eg_interp {
    314 	boolean					enabled;
    315 	unsigned				ij_index;
    316 };
    317 
    318 struct r600_shader_ctx {
    319 	struct tgsi_shader_info			info;
    320 	struct tgsi_parse_context		parse;
    321 	const struct tgsi_token			*tokens;
    322 	unsigned				type;
    323 	unsigned				file_offset[TGSI_FILE_COUNT];
    324 	unsigned				temp_reg;
    325 	const struct r600_shader_tgsi_instruction	*inst_info;
    326 	struct r600_bytecode			*bc;
    327 	struct r600_shader			*shader;
    328 	struct r600_shader_src			src[4];
    329 	uint32_t				*literals;
    330 	uint32_t				nliterals;
    331 	uint32_t				max_driver_temp_used;
    332 	/* needed for evergreen interpolation */
    333 	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
    334 	/* evergreen/cayman also store sample mask in face register */
    335 	int					face_gpr;
    336 	/* sample id is .w component stored in fixed point position register */
    337 	int					fixed_pt_position_gpr;
    338 	int					colors_used;
    339 	boolean                 clip_vertex_write;
    340 	unsigned                cv_output;
    341 	unsigned		edgeflag_output;
    342 	int					fragcoord_input;
    343 	int					native_integers;
    344 	int					next_ring_offset;
    345 	int					gs_out_ring_offset;
    346 	int					gs_next_vertex;
    347 	struct r600_shader	*gs_for_vs;
    348 	int					gs_export_gpr_tregs[4];
    349 	const struct pipe_stream_output_info	*gs_stream_output_info;
    350 	unsigned				enabled_stream_buffers_mask;
    351 	unsigned                                tess_input_info; /* temp with tess input offsets */
    352 	unsigned                                tess_output_info; /* temp with tess input offsets */
    353 };
    354 
    355 struct r600_shader_tgsi_instruction {
    356 	unsigned	op;
    357 	int (*process)(struct r600_shader_ctx *ctx);
    358 };
    359 
    360 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
    361 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
    362 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
    363 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
    364 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
    365 static int tgsi_else(struct r600_shader_ctx *ctx);
    366 static int tgsi_endif(struct r600_shader_ctx *ctx);
    367 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
    368 static int tgsi_endloop(struct r600_shader_ctx *ctx);
    369 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
    370 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
    371                                 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
    372                                 unsigned int dst_reg);
    373 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
    374 			const struct r600_shader_src *shader_src,
    375 			unsigned chan);
    376 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
    377 			       unsigned dst_reg);
    378 
    379 static int tgsi_last_instruction(unsigned writemask)
    380 {
    381 	int i, lasti = 0;
    382 
    383 	for (i = 0; i < 4; i++) {
    384 		if (writemask & (1 << i)) {
    385 			lasti = i;
    386 		}
    387 	}
    388 	return lasti;
    389 }
    390 
    391 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
    392 {
    393 	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
    394 	unsigned j;
    395 
    396 	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
    397 		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
    398 		return -EINVAL;
    399 	}
    400 	if (i->Instruction.Predicate) {
    401 		R600_ERR("predicate unsupported\n");
    402 		return -EINVAL;
    403 	}
    404 #if 0
    405 	if (i->Instruction.Label) {
    406 		R600_ERR("label unsupported\n");
    407 		return -EINVAL;
    408 	}
    409 #endif
    410 	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
    411 		if (i->Src[j].Register.Dimension) {
    412 		   switch (i->Src[j].Register.File) {
    413 		   case TGSI_FILE_CONSTANT:
    414 			   break;
    415 		   case TGSI_FILE_INPUT:
    416 			   if (ctx->type == PIPE_SHADER_GEOMETRY ||
    417 			       ctx->type == PIPE_SHADER_TESS_CTRL ||
    418 			       ctx->type == PIPE_SHADER_TESS_EVAL)
    419 				   break;
    420 		   case TGSI_FILE_OUTPUT:
    421 			   if (ctx->type == PIPE_SHADER_TESS_CTRL)
    422 				   break;
    423 		   default:
    424 			   R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
    425 				    i->Src[j].Register.File,
    426 				    i->Src[j].Register.Dimension);
    427 			   return -EINVAL;
    428 		   }
    429 		}
    430 	}
    431 	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
    432 		if (i->Dst[j].Register.Dimension) {
    433 			if (ctx->type == PIPE_SHADER_TESS_CTRL)
    434 				continue;
    435 			R600_ERR("unsupported dst (dimension)\n");
    436 			return -EINVAL;
    437 		}
    438 	}
    439 	return 0;
    440 }
    441 
    442 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
    443 {
    444 	if (interpolate == TGSI_INTERPOLATE_COLOR ||
    445 		interpolate == TGSI_INTERPOLATE_LINEAR ||
    446 		interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
    447 	{
    448 		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
    449 		int loc;
    450 
    451 		switch(location) {
    452 		case TGSI_INTERPOLATE_LOC_CENTER:
    453 			loc = 1;
    454 			break;
    455 		case TGSI_INTERPOLATE_LOC_CENTROID:
    456 			loc = 2;
    457 			break;
    458 		case TGSI_INTERPOLATE_LOC_SAMPLE:
    459 		default:
    460 			loc = 0; break;
    461 		}
    462 
    463 		return is_linear * 3 + loc;
    464 	}
    465 
    466 	return -1;
    467 }
    468 
    469 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
    470 		int input)
    471 {
    472 	int i = eg_get_interpolator_index(
    473 		ctx->shader->input[input].interpolate,
    474 		ctx->shader->input[input].interpolate_location);
    475 	assert(i >= 0);
    476 	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
    477 }
    478 
    479 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
    480 {
    481 	int i, r;
    482 	struct r600_bytecode_alu alu;
    483 	int gpr = 0, base_chan = 0;
    484 	int ij_index = ctx->shader->input[input].ij_index;
    485 
    486 	/* work out gpr and base_chan from index */
    487 	gpr = ij_index / 2;
    488 	base_chan = (2 * (ij_index % 2)) + 1;
    489 
    490 	for (i = 0; i < 8; i++) {
    491 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    492 
    493 		if (i < 4)
    494 			alu.op = ALU_OP2_INTERP_ZW;
    495 		else
    496 			alu.op = ALU_OP2_INTERP_XY;
    497 
    498 		if ((i > 1) && (i < 6)) {
    499 			alu.dst.sel = ctx->shader->input[input].gpr;
    500 			alu.dst.write = 1;
    501 		}
    502 
    503 		alu.dst.chan = i % 4;
    504 
    505 		alu.src[0].sel = gpr;
    506 		alu.src[0].chan = (base_chan - (i % 2));
    507 
    508 		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
    509 
    510 		alu.bank_swizzle_force = SQ_ALU_VEC_210;
    511 		if ((i % 4) == 3)
    512 			alu.last = 1;
    513 		r = r600_bytecode_add_alu(ctx->bc, &alu);
    514 		if (r)
    515 			return r;
    516 	}
    517 	return 0;
    518 }
    519 
    520 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
    521 {
    522 	int i, r;
    523 	struct r600_bytecode_alu alu;
    524 
    525 	for (i = 0; i < 4; i++) {
    526 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    527 
    528 		alu.op = ALU_OP1_INTERP_LOAD_P0;
    529 
    530 		alu.dst.sel = ctx->shader->input[input].gpr;
    531 		alu.dst.write = 1;
    532 
    533 		alu.dst.chan = i;
    534 
    535 		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
    536 		alu.src[0].chan = i;
    537 
    538 		if (i == 3)
    539 			alu.last = 1;
    540 		r = r600_bytecode_add_alu(ctx->bc, &alu);
    541 		if (r)
    542 			return r;
    543 	}
    544 	return 0;
    545 }
    546 
    547 /*
    548  * Special export handling in shaders
    549  *
    550  * shader export ARRAY_BASE for EXPORT_POS:
    551  * 60 is position
    552  * 61 is misc vector
    553  * 62, 63 are clip distance vectors
    554  *
    555  * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
    556  * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
    557  * USE_VTX_POINT_SIZE - point size in the X channel of export 61
    558  * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
    559  * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
    560  * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
    561  * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
    562  * exclusive from render target index)
    563  * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
    564  *
    565  *
    566  * shader export ARRAY_BASE for EXPORT_PIXEL:
    567  * 0-7 CB targets
    568  * 61 computed Z vector
    569  *
    570  * The use of the values exported in the computed Z vector are controlled
    571  * by DB_SHADER_CONTROL:
    572  * Z_EXPORT_ENABLE - Z as a float in RED
    573  * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
    574  * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
    575  * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
    576  * DB_SOURCE_FORMAT - export control restrictions
    577  *
    578  */
    579 
    580 
    581 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
    582 static int r600_spi_sid(struct r600_shader_io * io)
    583 {
    584 	int index, name = io->name;
    585 
    586 	/* These params are handled differently, they don't need
    587 	 * semantic indices, so we'll use 0 for them.
    588 	 */
    589 	if (name == TGSI_SEMANTIC_POSITION ||
    590 	    name == TGSI_SEMANTIC_PSIZE ||
    591 	    name == TGSI_SEMANTIC_EDGEFLAG ||
    592 	    name == TGSI_SEMANTIC_FACE ||
    593 	    name == TGSI_SEMANTIC_SAMPLEMASK)
    594 		index = 0;
    595 	else {
    596 		if (name == TGSI_SEMANTIC_GENERIC) {
    597 			/* For generic params simply use sid from tgsi */
    598 			index = io->sid;
    599 		} else {
    600 			/* For non-generic params - pack name and sid into 8 bits */
    601 			index = 0x80 | (name<<3) | (io->sid);
    602 		}
    603 
    604 		/* Make sure that all really used indices have nonzero value, so
    605 		 * we can just compare it to 0 later instead of comparing the name
    606 		 * with different values to detect special cases. */
    607 		index++;
    608 	}
    609 
    610 	return index;
    611 };
    612 
    613 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
    614 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
    615 {
    616 	switch (semantic_name) {
    617 	case TGSI_SEMANTIC_POSITION:
    618 		return 0;
    619 	case TGSI_SEMANTIC_PSIZE:
    620 		return 1;
    621 	case TGSI_SEMANTIC_CLIPDIST:
    622 		assert(index <= 1);
    623 		return 2 + index;
    624 	case TGSI_SEMANTIC_GENERIC:
    625 		if (index <= 63-4)
    626 			return 4 + index - 9;
    627 		else
    628 			/* same explanation as in the default statement,
    629 			 * the only user hitting this is st/nine.
    630 			 */
    631 			return 0;
    632 
    633 	/* patch indices are completely separate and thus start from 0 */
    634 	case TGSI_SEMANTIC_TESSOUTER:
    635 		return 0;
    636 	case TGSI_SEMANTIC_TESSINNER:
    637 		return 1;
    638 	case TGSI_SEMANTIC_PATCH:
    639 		return 2 + index;
    640 
    641 	default:
    642 		/* Don't fail here. The result of this function is only used
    643 		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
    644 		 * occur, but this function is called for all vertex shaders
    645 		 * before it's known whether LS will be compiled or not.
    646 		 */
    647 		return 0;
    648 	}
    649 }
    650 
    651 /* turn input into interpolate on EG */
    652 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
    653 {
    654 	int r = 0;
    655 
    656 	if (ctx->shader->input[index].spi_sid) {
    657 		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
    658 		if (ctx->shader->input[index].interpolate > 0) {
    659 			evergreen_interp_assign_ij_index(ctx, index);
    660 			r = evergreen_interp_alu(ctx, index);
    661 		} else {
    662 			r = evergreen_interp_flat(ctx, index);
    663 		}
    664 	}
    665 	return r;
    666 }
    667 
    668 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
    669 {
    670 	struct r600_bytecode_alu alu;
    671 	int i, r;
    672 	int gpr_front = ctx->shader->input[front].gpr;
    673 	int gpr_back = ctx->shader->input[back].gpr;
    674 
    675 	for (i = 0; i < 4; i++) {
    676 		memset(&alu, 0, sizeof(alu));
    677 		alu.op = ALU_OP3_CNDGT;
    678 		alu.is_op3 = 1;
    679 		alu.dst.write = 1;
    680 		alu.dst.sel = gpr_front;
    681 		alu.src[0].sel = ctx->face_gpr;
    682 		alu.src[1].sel = gpr_front;
    683 		alu.src[2].sel = gpr_back;
    684 
    685 		alu.dst.chan = i;
    686 		alu.src[1].chan = i;
    687 		alu.src[2].chan = i;
    688 		alu.last = (i==3);
    689 
    690 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
    691 			return r;
    692 	}
    693 
    694 	return 0;
    695 }
    696 
    697 /* execute a single slot ALU calculation */
    698 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
    699 			  int dst_sel, int dst_chan,
    700 			  int src0_sel, unsigned src0_chan_val,
    701 			  int src1_sel, unsigned src1_chan_val)
    702 {
    703 	struct r600_bytecode_alu alu;
    704 	int r, i;
    705 
    706 	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
    707 		for (i = 0; i < 4; i++) {
    708 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    709 			alu.op = op;
    710 			alu.src[0].sel = src0_sel;
    711 			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
    712 				alu.src[0].value = src0_chan_val;
    713 			else
    714 				alu.src[0].chan = src0_chan_val;
    715 			alu.src[1].sel = src1_sel;
    716 			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
    717 				alu.src[1].value = src1_chan_val;
    718 			else
    719 				alu.src[1].chan = src1_chan_val;
    720 			alu.dst.sel = dst_sel;
    721 			alu.dst.chan = i;
    722 			alu.dst.write = i == dst_chan;
    723 			alu.last = (i == 3);
    724 			r = r600_bytecode_add_alu(ctx->bc, &alu);
    725 			if (r)
    726 				return r;
    727 		}
    728 		return 0;
    729 	}
    730 
    731 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    732 	alu.op = op;
    733 	alu.src[0].sel = src0_sel;
    734 	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
    735 		alu.src[0].value = src0_chan_val;
    736 	else
    737 		alu.src[0].chan = src0_chan_val;
    738 	alu.src[1].sel = src1_sel;
    739 	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
    740 		alu.src[1].value = src1_chan_val;
    741 	else
    742 		alu.src[1].chan = src1_chan_val;
    743 	alu.dst.sel = dst_sel;
    744 	alu.dst.chan = dst_chan;
    745 	alu.dst.write = 1;
    746 	alu.last = 1;
    747 	r = r600_bytecode_add_alu(ctx->bc, &alu);
    748 	if (r)
    749 		return r;
    750 	return 0;
    751 }
    752 
    753 /* execute a single slot ALU calculation */
    754 static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
    755 			  int dst_sel, int dst_chan,
    756 			  int src0_sel, unsigned src0_chan_val,
    757 			  int src1_sel, unsigned src1_chan_val,
    758 			  int src2_sel, unsigned src2_chan_val)
    759 {
    760 	struct r600_bytecode_alu alu;
    761 	int r;
    762 
    763 	/* validate this for other ops */
    764 	assert(op == ALU_OP3_MULADD_UINT24);
    765 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    766 	alu.op = op;
    767 	alu.src[0].sel = src0_sel;
    768 	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
    769 		alu.src[0].value = src0_chan_val;
    770 	else
    771 		alu.src[0].chan = src0_chan_val;
    772 	alu.src[1].sel = src1_sel;
    773 	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
    774 		alu.src[1].value = src1_chan_val;
    775 	else
    776 		alu.src[1].chan = src1_chan_val;
    777 	alu.src[2].sel = src2_sel;
    778 	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
    779 		alu.src[2].value = src2_chan_val;
    780 	else
    781 		alu.src[2].chan = src2_chan_val;
    782 	alu.dst.sel = dst_sel;
    783 	alu.dst.chan = dst_chan;
    784 	alu.is_op3 = 1;
    785 	alu.last = 1;
    786 	r = r600_bytecode_add_alu(ctx->bc, &alu);
    787 	if (r)
    788 		return r;
    789 	return 0;
    790 }
    791 
    792 /* put it in temp_reg.x */
    793 static int get_lds_offset0(struct r600_shader_ctx *ctx,
    794 			   int rel_patch_chan,
    795 			   int temp_reg, bool is_patch_var)
    796 {
    797 	int r;
    798 
    799 	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
    800 	/* ADD
    801 	   Dimension - patch0_offset (input_vals.z),
    802 	   Non-dim - patch0_data_offset (input_vals.w)
    803 	*/
    804 	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
    805 			   temp_reg, 0,
    806 			   ctx->tess_output_info, 0,
    807 			   0, rel_patch_chan,
    808 			   ctx->tess_output_info, is_patch_var ? 3 : 2);
    809 	if (r)
    810 		return r;
    811 	return 0;
    812 }
    813 
    814 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
    815 {
    816 	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
    817 }
    818 
    819 static int r600_get_temp(struct r600_shader_ctx *ctx)
    820 {
    821 	return ctx->temp_reg + ctx->max_driver_temp_used++;
    822 }
    823 
    824 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
    825 {
    826 	int i;
    827 	i = ctx->shader->noutput++;
    828 	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
    829 	ctx->shader->output[i].sid = 0;
    830 	ctx->shader->output[i].gpr = 0;
    831 	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
    832 	ctx->shader->output[i].write_mask = 0x4;
    833 	ctx->shader->output[i].spi_sid = prim_id_sid;
    834 
    835 	return 0;
    836 }
    837 
    838 static int tgsi_barrier(struct r600_shader_ctx *ctx)
    839 {
    840 	struct r600_bytecode_alu alu;
    841 	int r;
    842 
    843 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    844 	alu.op = ctx->inst_info->op;
    845 	alu.last = 1;
    846 
    847 	r = r600_bytecode_add_alu(ctx->bc, &alu);
    848 	if (r)
    849 		return r;
    850 	return 0;
    851 }
    852 
    853 static int tgsi_declaration(struct r600_shader_ctx *ctx)
    854 {
    855 	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
    856 	int r, i, j, count = d->Range.Last - d->Range.First + 1;
    857 
    858 	switch (d->Declaration.File) {
    859 	case TGSI_FILE_INPUT:
    860 		for (j = 0; j < count; j++) {
    861 			i = ctx->shader->ninput + j;
    862 			assert(i < ARRAY_SIZE(ctx->shader->input));
    863 			ctx->shader->input[i].name = d->Semantic.Name;
    864 			ctx->shader->input[i].sid = d->Semantic.Index + j;
    865 			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
    866 			ctx->shader->input[i].interpolate_location = d->Interp.Location;
    867 			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
    868 			if (ctx->type == PIPE_SHADER_FRAGMENT) {
    869 				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
    870 				switch (ctx->shader->input[i].name) {
    871 				case TGSI_SEMANTIC_FACE:
    872 					if (ctx->face_gpr != -1)
    873 						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
    874 					else
    875 						ctx->face_gpr = ctx->shader->input[i].gpr;
    876 					break;
    877 				case TGSI_SEMANTIC_COLOR:
    878 					ctx->colors_used++;
    879 					break;
    880 				case TGSI_SEMANTIC_POSITION:
    881 					ctx->fragcoord_input = i;
    882 					break;
    883 				case TGSI_SEMANTIC_PRIMID:
    884 					/* set this for now */
    885 					ctx->shader->gs_prim_id_input = true;
    886 					ctx->shader->ps_prim_id_input = i;
    887 					break;
    888 				}
    889 				if (ctx->bc->chip_class >= EVERGREEN) {
    890 					if ((r = evergreen_interp_input(ctx, i)))
    891 						return r;
    892 				}
    893 			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
    894 				/* FIXME probably skip inputs if they aren't passed in the ring */
    895 				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
    896 				ctx->next_ring_offset += 16;
    897 				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
    898 					ctx->shader->gs_prim_id_input = true;
    899 			}
    900 		}
    901 		ctx->shader->ninput += count;
    902 		break;
    903 	case TGSI_FILE_OUTPUT:
    904 		for (j = 0; j < count; j++) {
    905 			i = ctx->shader->noutput + j;
    906 			assert(i < ARRAY_SIZE(ctx->shader->output));
    907 			ctx->shader->output[i].name = d->Semantic.Name;
    908 			ctx->shader->output[i].sid = d->Semantic.Index + j;
    909 			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
    910 			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
    911 			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
    912 			if (ctx->type == PIPE_SHADER_VERTEX ||
    913 			    ctx->type == PIPE_SHADER_GEOMETRY ||
    914 			    ctx->type == PIPE_SHADER_TESS_EVAL) {
    915 				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
    916 				switch (d->Semantic.Name) {
    917 				case TGSI_SEMANTIC_CLIPDIST:
    918 					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
    919 									((d->Semantic.Index + j) << 2);
    920 					break;
    921 				case TGSI_SEMANTIC_PSIZE:
    922 					ctx->shader->vs_out_misc_write = 1;
    923 					ctx->shader->vs_out_point_size = 1;
    924 					break;
    925 				case TGSI_SEMANTIC_EDGEFLAG:
    926 					ctx->shader->vs_out_misc_write = 1;
    927 					ctx->shader->vs_out_edgeflag = 1;
    928 					ctx->edgeflag_output = i;
    929 					break;
    930 				case TGSI_SEMANTIC_VIEWPORT_INDEX:
    931 					ctx->shader->vs_out_misc_write = 1;
    932 					ctx->shader->vs_out_viewport = 1;
    933 					break;
    934 				case TGSI_SEMANTIC_LAYER:
    935 					ctx->shader->vs_out_misc_write = 1;
    936 					ctx->shader->vs_out_layer = 1;
    937 					break;
    938 				case TGSI_SEMANTIC_CLIPVERTEX:
    939 					ctx->clip_vertex_write = TRUE;
    940 					ctx->cv_output = i;
    941 					break;
    942 				}
    943 				if (ctx->type == PIPE_SHADER_GEOMETRY) {
    944 					ctx->gs_out_ring_offset += 16;
    945 				}
    946 			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
    947 				switch (d->Semantic.Name) {
    948 				case TGSI_SEMANTIC_COLOR:
    949 					ctx->shader->nr_ps_max_color_exports++;
    950 					break;
    951 				}
    952 			}
    953 		}
    954 		ctx->shader->noutput += count;
    955 		break;
    956 	case TGSI_FILE_TEMPORARY:
    957 		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
    958 			if (d->Array.ArrayID) {
    959 				r600_add_gpr_array(ctx->shader,
    960 				               ctx->file_offset[TGSI_FILE_TEMPORARY] +
    961 								   d->Range.First,
    962 				               d->Range.Last - d->Range.First + 1, 0x0F);
    963 			}
    964 		}
    965 		break;
    966 
    967 	case TGSI_FILE_CONSTANT:
    968 	case TGSI_FILE_SAMPLER:
    969 	case TGSI_FILE_SAMPLER_VIEW:
    970 	case TGSI_FILE_ADDRESS:
    971 		break;
    972 
    973 	case TGSI_FILE_SYSTEM_VALUE:
    974 		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
    975 			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
    976 			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
    977 			break; /* Already handled from allocate_system_value_inputs */
    978 		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
    979 			if (!ctx->native_integers) {
    980 				struct r600_bytecode_alu alu;
    981 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
    982 
    983 				alu.op = ALU_OP1_INT_TO_FLT;
    984 				alu.src[0].sel = 0;
    985 				alu.src[0].chan = 3;
    986 
    987 				alu.dst.sel = 0;
    988 				alu.dst.chan = 3;
    989 				alu.dst.write = 1;
    990 				alu.last = 1;
    991 
    992 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
    993 					return r;
    994 			}
    995 			break;
    996 		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
    997 			break;
    998 		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
    999 			break;
   1000 		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
   1001 			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
   1002 			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
   1003 			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
   1004 			unsigned temp_reg = r600_get_temp(ctx);
   1005 
   1006 			r = get_lds_offset0(ctx, 2, temp_reg, true);
   1007 			if (r)
   1008 				return r;
   1009 
   1010 			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   1011 					   temp_reg, 0,
   1012 					   temp_reg, 0,
   1013 					   V_SQ_ALU_SRC_LITERAL, param * 16);
   1014 			if (r)
   1015 				return r;
   1016 
   1017 			do_lds_fetch_values(ctx, temp_reg, dreg);
   1018 		}
   1019 		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
   1020 			/* MOV r1.x, r0.x;
   1021 			   MOV r1.y, r0.y;
   1022 			*/
   1023 			for (i = 0; i < 2; i++) {
   1024 				struct r600_bytecode_alu alu;
   1025 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1026 				alu.op = ALU_OP1_MOV;
   1027 				alu.src[0].sel = 0;
   1028 				alu.src[0].chan = 0 + i;
   1029 				alu.dst.sel = 1;
   1030 				alu.dst.chan = 0 + i;
   1031 				alu.dst.write = 1;
   1032 				alu.last = (i == 1) ? 1 : 0;
   1033 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   1034 					return r;
   1035 			}
   1036 			/* ADD r1.z, 1.0f, -r0.x */
   1037 			struct r600_bytecode_alu alu;
   1038 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1039 			alu.op = ALU_OP2_ADD;
   1040 			alu.src[0].sel = V_SQ_ALU_SRC_1;
   1041 			alu.src[1].sel = 1;
   1042 			alu.src[1].chan = 0;
   1043 			alu.src[1].neg = 1;
   1044 			alu.dst.sel = 1;
   1045 			alu.dst.chan = 2;
   1046 			alu.dst.write = 1;
   1047 			alu.last = 1;
   1048 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   1049 				return r;
   1050 
   1051 			/* ADD r1.z, r1.z, -r1.y */
   1052 			alu.op = ALU_OP2_ADD;
   1053 			alu.src[0].sel = 1;
   1054 			alu.src[0].chan = 2;
   1055 			alu.src[1].sel = 1;
   1056 			alu.src[1].chan = 1;
   1057 			alu.src[1].neg = 1;
   1058 			alu.dst.sel = 1;
   1059 			alu.dst.chan = 2;
   1060 			alu.dst.write = 1;
   1061 			alu.last = 1;
   1062 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   1063 				return r;
   1064 			break;
   1065 		}
   1066 		break;
   1067 	default:
   1068 		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
   1069 		return -EINVAL;
   1070 	}
   1071 	return 0;
   1072 }
   1073 
   1074 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
   1075 {
   1076 	struct tgsi_parse_context parse;
   1077 	struct {
   1078 		boolean enabled;
   1079 		int *reg;
   1080 		unsigned name, alternate_name;
   1081 	} inputs[2] = {
   1082 		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
   1083 
   1084 		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
   1085 	};
   1086 	int i, k, num_regs = 0;
   1087 
   1088 	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
   1089 		return 0;
   1090 	}
   1091 
   1092 	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
   1093 	while (!tgsi_parse_end_of_tokens(&parse)) {
   1094 		tgsi_parse_token(&parse);
   1095 
   1096 		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
   1097 			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
   1098 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
   1099 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
   1100 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
   1101 			{
   1102 				int interpolate, location, k;
   1103 
   1104 				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   1105 					location = TGSI_INTERPOLATE_LOC_CENTER;
   1106 					inputs[1].enabled = true; /* needs SAMPLEID */
   1107 				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
   1108 					location = TGSI_INTERPOLATE_LOC_CENTER;
   1109 					/* Needs sample positions, currently those are always available */
   1110 				} else {
   1111 					location = TGSI_INTERPOLATE_LOC_CENTROID;
   1112 				}
   1113 
   1114 				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
   1115 				k = eg_get_interpolator_index(interpolate, location);
   1116 				ctx->eg_interpolators[k].enabled = true;
   1117 			}
   1118 		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
   1119 			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
   1120 			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
   1121 				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
   1122 					if (d->Semantic.Name == inputs[k].name ||
   1123 						d->Semantic.Name == inputs[k].alternate_name) {
   1124 						inputs[k].enabled = true;
   1125 					}
   1126 				}
   1127 			}
   1128 		}
   1129 	}
   1130 
   1131 	tgsi_parse_free(&parse);
   1132 
   1133 	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
   1134 		boolean enabled = inputs[i].enabled;
   1135 		int *reg = inputs[i].reg;
   1136 		unsigned name = inputs[i].name;
   1137 
   1138 		if (enabled) {
   1139 			int gpr = gpr_offset + num_regs++;
   1140 
   1141 			// add to inputs, allocate a gpr
   1142 			k = ctx->shader->ninput ++;
   1143 			ctx->shader->input[k].name = name;
   1144 			ctx->shader->input[k].sid = 0;
   1145 			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
   1146 			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
   1147 			*reg = ctx->shader->input[k].gpr = gpr;
   1148 		}
   1149 	}
   1150 
   1151 	return gpr_offset + num_regs;
   1152 }
   1153 
   1154 /*
   1155  * for evergreen we need to scan the shader to find the number of GPRs we need to
   1156  * reserve for interpolation and system values
   1157  *
   1158  * we need to know if we are going to emit
   1159  * any sample or centroid inputs
   1160  * if perspective and linear are required
   1161 */
   1162 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
   1163 {
   1164 	unsigned i;
   1165 	int num_baryc;
   1166 	struct tgsi_parse_context parse;
   1167 
   1168 	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
   1169 
   1170 	for (i = 0; i < ctx->info.num_inputs; i++) {
   1171 		int k;
   1172 		/* skip position/face/mask/sampleid */
   1173 		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
   1174 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
   1175 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
   1176 		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
   1177 			continue;
   1178 
   1179 		k = eg_get_interpolator_index(
   1180 			ctx->info.input_interpolate[i],
   1181 			ctx->info.input_interpolate_loc[i]);
   1182 		if (k >= 0)
   1183 			ctx->eg_interpolators[k].enabled = TRUE;
   1184 	}
   1185 
   1186 	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
   1187 		return 0;
   1188 	}
   1189 
   1190 	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
   1191 	while (!tgsi_parse_end_of_tokens(&parse)) {
   1192 		tgsi_parse_token(&parse);
   1193 
   1194 		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
   1195 			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
   1196 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
   1197 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
   1198 				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
   1199 			{
   1200 				int interpolate, location, k;
   1201 
   1202 				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   1203 					location = TGSI_INTERPOLATE_LOC_CENTER;
   1204 				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
   1205 					location = TGSI_INTERPOLATE_LOC_CENTER;
   1206 				} else {
   1207 					location = TGSI_INTERPOLATE_LOC_CENTROID;
   1208 				}
   1209 
   1210 				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
   1211 				k = eg_get_interpolator_index(interpolate, location);
   1212 				ctx->eg_interpolators[k].enabled = true;
   1213 			}
   1214 		}
   1215 	}
   1216 
   1217 	tgsi_parse_free(&parse);
   1218 
   1219 	/* assign gpr to each interpolator according to priority */
   1220 	num_baryc = 0;
   1221 	for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
   1222 		if (ctx->eg_interpolators[i].enabled) {
   1223 			ctx->eg_interpolators[i].ij_index = num_baryc;
   1224 			num_baryc ++;
   1225 		}
   1226 	}
   1227 
   1228 	/* XXX PULL MODEL and LINE STIPPLE */
   1229 
   1230 	num_baryc = (num_baryc + 1) >> 1;
   1231 	return allocate_system_value_inputs(ctx, num_baryc);
   1232 }
   1233 
   1234 /* sample_id_sel == NULL means fetch for current sample */
   1235 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
   1236 {
   1237 	struct r600_bytecode_vtx vtx;
   1238 	int r, t1;
   1239 
   1240 	assert(ctx->fixed_pt_position_gpr != -1);
   1241 
   1242 	t1 = r600_get_temp(ctx);
   1243 
   1244 	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
   1245 	vtx.op = FETCH_OP_VFETCH;
   1246 	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
   1247 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   1248 	if (sample_id == NULL) {
   1249 		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
   1250 		vtx.src_sel_x = 3;
   1251 	}
   1252 	else {
   1253 		struct r600_bytecode_alu alu;
   1254 
   1255 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1256 		alu.op = ALU_OP1_MOV;
   1257 		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
   1258 		alu.dst.sel = t1;
   1259 		alu.dst.write = 1;
   1260 		alu.last = 1;
   1261 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   1262 		if (r)
   1263 			return r;
   1264 
   1265 		vtx.src_gpr = t1;
   1266 		vtx.src_sel_x = 0;
   1267 	}
   1268 	vtx.mega_fetch_count = 16;
   1269 	vtx.dst_gpr = t1;
   1270 	vtx.dst_sel_x = 0;
   1271 	vtx.dst_sel_y = 1;
   1272 	vtx.dst_sel_z = 2;
   1273 	vtx.dst_sel_w = 3;
   1274 	vtx.data_format = FMT_32_32_32_32_FLOAT;
   1275 	vtx.num_format_all = 2;
   1276 	vtx.format_comp_all = 1;
   1277 	vtx.use_const_fields = 0;
   1278 	vtx.offset = 1; // first element is size of buffer
   1279 	vtx.endian = r600_endian_swap(32);
   1280 	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
   1281 
   1282 	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
   1283 	if (r)
   1284 		return r;
   1285 
   1286 	return t1;
   1287 }
   1288 
   1289 static void tgsi_src(struct r600_shader_ctx *ctx,
   1290 		     const struct tgsi_full_src_register *tgsi_src,
   1291 		     struct r600_shader_src *r600_src)
   1292 {
   1293 	memset(r600_src, 0, sizeof(*r600_src));
   1294 	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
   1295 	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
   1296 	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
   1297 	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
   1298 	r600_src->neg = tgsi_src->Register.Negate;
   1299 	r600_src->abs = tgsi_src->Register.Absolute;
   1300 
   1301 	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
   1302 		int index;
   1303 		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
   1304 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
   1305 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
   1306 
   1307 			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
   1308 			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
   1309 			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
   1310 				return;
   1311 		}
   1312 		index = tgsi_src->Register.Index;
   1313 		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
   1314 		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
   1315 	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
   1316 		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
   1317 			r600_src->swizzle[0] = 2; // Z value
   1318 			r600_src->swizzle[1] = 2;
   1319 			r600_src->swizzle[2] = 2;
   1320 			r600_src->swizzle[3] = 2;
   1321 			r600_src->sel = ctx->face_gpr;
   1322 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
   1323 			r600_src->swizzle[0] = 3; // W value
   1324 			r600_src->swizzle[1] = 3;
   1325 			r600_src->swizzle[2] = 3;
   1326 			r600_src->swizzle[3] = 3;
   1327 			r600_src->sel = ctx->fixed_pt_position_gpr;
   1328 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
   1329 			r600_src->swizzle[0] = 0;
   1330 			r600_src->swizzle[1] = 1;
   1331 			r600_src->swizzle[2] = 4;
   1332 			r600_src->swizzle[3] = 4;
   1333 			r600_src->sel = load_sample_position(ctx, NULL, -1);
   1334 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
   1335 			r600_src->swizzle[0] = 3;
   1336 			r600_src->swizzle[1] = 3;
   1337 			r600_src->swizzle[2] = 3;
   1338 			r600_src->swizzle[3] = 3;
   1339 			r600_src->sel = 0;
   1340 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
   1341 			r600_src->swizzle[0] = 0;
   1342 			r600_src->swizzle[1] = 0;
   1343 			r600_src->swizzle[2] = 0;
   1344 			r600_src->swizzle[3] = 0;
   1345 			r600_src->sel = 0;
   1346 		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
   1347 			r600_src->swizzle[0] = 3;
   1348 			r600_src->swizzle[1] = 3;
   1349 			r600_src->swizzle[2] = 3;
   1350 			r600_src->swizzle[3] = 3;
   1351 			r600_src->sel = 1;
   1352 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
   1353 			r600_src->swizzle[0] = 2;
   1354 			r600_src->swizzle[1] = 2;
   1355 			r600_src->swizzle[2] = 2;
   1356 			r600_src->swizzle[3] = 2;
   1357 			r600_src->sel = 0;
   1358 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
   1359 			r600_src->sel = 1;
   1360 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
   1361 			r600_src->sel = 3;
   1362 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
   1363 			r600_src->sel = 2;
   1364 		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
   1365 			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
   1366 				r600_src->sel = ctx->tess_input_info;
   1367 				r600_src->swizzle[0] = 2;
   1368 				r600_src->swizzle[1] = 2;
   1369 				r600_src->swizzle[2] = 2;
   1370 				r600_src->swizzle[3] = 2;
   1371 			} else {
   1372 				r600_src->sel = ctx->tess_input_info;
   1373 				r600_src->swizzle[0] = 3;
   1374 				r600_src->swizzle[1] = 3;
   1375 				r600_src->swizzle[2] = 3;
   1376 				r600_src->swizzle[3] = 3;
   1377 			}
   1378 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
   1379 			r600_src->sel = 0;
   1380 			r600_src->swizzle[0] = 0;
   1381 			r600_src->swizzle[1] = 0;
   1382 			r600_src->swizzle[2] = 0;
   1383 			r600_src->swizzle[3] = 0;
   1384 		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
   1385 			r600_src->sel = 0;
   1386 			r600_src->swizzle[0] = 3;
   1387 			r600_src->swizzle[1] = 3;
   1388 			r600_src->swizzle[2] = 3;
   1389 			r600_src->swizzle[3] = 3;
   1390 		}
   1391 	} else {
   1392 		if (tgsi_src->Register.Indirect)
   1393 			r600_src->rel = V_SQ_REL_RELATIVE;
   1394 		r600_src->sel = tgsi_src->Register.Index;
   1395 		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
   1396 	}
   1397 	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
   1398 		if (tgsi_src->Register.Dimension) {
   1399 			r600_src->kc_bank = tgsi_src->Dimension.Index;
   1400 			if (tgsi_src->Dimension.Indirect) {
   1401 				r600_src->kc_rel = 1;
   1402 			}
   1403 		}
   1404 	}
   1405 }
   1406 
   1407 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
   1408                                 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
   1409                                 unsigned int dst_reg)
   1410 {
   1411 	struct r600_bytecode_vtx vtx;
   1412 	unsigned int ar_reg;
   1413 	int r;
   1414 
   1415 	if (offset) {
   1416 		struct r600_bytecode_alu alu;
   1417 
   1418 		memset(&alu, 0, sizeof(alu));
   1419 
   1420 		alu.op = ALU_OP2_ADD_INT;
   1421 		alu.src[0].sel = ctx->bc->ar_reg;
   1422 		alu.src[0].chan = ar_chan;
   1423 
   1424 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   1425 		alu.src[1].value = offset;
   1426 
   1427 		alu.dst.sel = dst_reg;
   1428 		alu.dst.chan = ar_chan;
   1429 		alu.dst.write = 1;
   1430 		alu.last = 1;
   1431 
   1432 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   1433 			return r;
   1434 
   1435 		ar_reg = dst_reg;
   1436 	} else {
   1437 		ar_reg = ctx->bc->ar_reg;
   1438 	}
   1439 
   1440 	memset(&vtx, 0, sizeof(vtx));
   1441 	vtx.buffer_id = cb_idx;
   1442 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   1443 	vtx.src_gpr = ar_reg;
   1444 	vtx.src_sel_x = ar_chan;
   1445 	vtx.mega_fetch_count = 16;
   1446 	vtx.dst_gpr = dst_reg;
   1447 	vtx.dst_sel_x = 0;		/* SEL_X */
   1448 	vtx.dst_sel_y = 1;		/* SEL_Y */
   1449 	vtx.dst_sel_z = 2;		/* SEL_Z */
   1450 	vtx.dst_sel_w = 3;		/* SEL_W */
   1451 	vtx.data_format = FMT_32_32_32_32_FLOAT;
   1452 	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
   1453 	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
   1454 	vtx.endian = r600_endian_swap(32);
   1455 	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
   1456 
   1457 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
   1458 		return r;
   1459 
   1460 	return 0;
   1461 }
   1462 
   1463 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
   1464 {
   1465 	struct r600_bytecode_vtx vtx;
   1466 	int r;
   1467 	unsigned index = src->Register.Index;
   1468 	unsigned vtx_id = src->Dimension.Index;
   1469 	int offset_reg = vtx_id / 3;
   1470 	int offset_chan = vtx_id % 3;
   1471 	int t2 = 0;
   1472 
   1473 	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
   1474 	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
   1475 
   1476 	if (offset_reg == 0 && offset_chan == 2)
   1477 		offset_chan = 3;
   1478 
   1479 	if (src->Dimension.Indirect || src->Register.Indirect)
   1480 		t2 = r600_get_temp(ctx);
   1481 
   1482 	if (src->Dimension.Indirect) {
   1483 		int treg[3];
   1484 		struct r600_bytecode_alu alu;
   1485 		int r, i;
   1486 		unsigned addr_reg;
   1487 		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
   1488 		if (src->DimIndirect.Index > 0) {
   1489 			r = single_alu_op2(ctx, ALU_OP1_MOV,
   1490 					   ctx->bc->ar_reg, 0,
   1491 					   addr_reg, 0,
   1492 					   0, 0);
   1493 			if (r)
   1494 				return r;
   1495 		}
   1496 		/*
   1497 		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
   1498 		   at least this is what fglrx seems to do. */
   1499 		for (i = 0; i < 3; i++) {
   1500 			treg[i] = r600_get_temp(ctx);
   1501 		}
   1502 		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
   1503 
   1504 		for (i = 0; i < 3; i++) {
   1505 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1506 			alu.op = ALU_OP1_MOV;
   1507 			alu.src[0].sel = 0;
   1508 			alu.src[0].chan = i == 2 ? 3 : i;
   1509 			alu.dst.sel = treg[i];
   1510 			alu.dst.chan = 0;
   1511 			alu.dst.write = 1;
   1512 			alu.last = 1;
   1513 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   1514 			if (r)
   1515 				return r;
   1516 		}
   1517 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1518 		alu.op = ALU_OP1_MOV;
   1519 		alu.src[0].sel = treg[0];
   1520 		alu.src[0].rel = 1;
   1521 		alu.dst.sel = t2;
   1522 		alu.dst.write = 1;
   1523 		alu.last = 1;
   1524 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   1525 		if (r)
   1526 			return r;
   1527 		offset_reg = t2;
   1528 		offset_chan = 0;
   1529 	}
   1530 
   1531 	if (src->Register.Indirect) {
   1532 		int addr_reg;
   1533 		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
   1534 
   1535 		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
   1536 
   1537 		/* pull the value from index_reg */
   1538 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   1539 				   t2, 1,
   1540 				   addr_reg, 0,
   1541 				   V_SQ_ALU_SRC_LITERAL, first);
   1542 		if (r)
   1543 			return r;
   1544 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
   1545 				   t2, 0,
   1546 				   t2, 1,
   1547 				   V_SQ_ALU_SRC_LITERAL, 4,
   1548 				   offset_reg, offset_chan);
   1549 		if (r)
   1550 			return r;
   1551 		offset_reg = t2;
   1552 		offset_chan = 0;
   1553 		index = src->Register.Index - first;
   1554 	}
   1555 
   1556 	memset(&vtx, 0, sizeof(vtx));
   1557 	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
   1558 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   1559 	vtx.src_gpr = offset_reg;
   1560 	vtx.src_sel_x = offset_chan;
   1561 	vtx.offset = index * 16; /*bytes*/
   1562 	vtx.mega_fetch_count = 16;
   1563 	vtx.dst_gpr = dst_reg;
   1564 	vtx.dst_sel_x = 0;		/* SEL_X */
   1565 	vtx.dst_sel_y = 1;		/* SEL_Y */
   1566 	vtx.dst_sel_z = 2;		/* SEL_Z */
   1567 	vtx.dst_sel_w = 3;		/* SEL_W */
   1568 	if (ctx->bc->chip_class >= EVERGREEN) {
   1569 		vtx.use_const_fields = 1;
   1570 	} else {
   1571 		vtx.data_format = FMT_32_32_32_32_FLOAT;
   1572 	}
   1573 
   1574 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
   1575 		return r;
   1576 
   1577 	return 0;
   1578 }
   1579 
   1580 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
   1581 {
   1582 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1583 	unsigned i;
   1584 
   1585 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1586 		struct tgsi_full_src_register *src = &inst->Src[i];
   1587 
   1588 		if (src->Register.File == TGSI_FILE_INPUT) {
   1589 			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
   1590 				/* primitive id is in R0.z */
   1591 				ctx->src[i].sel = 0;
   1592 				ctx->src[i].swizzle[0] = 2;
   1593 			}
   1594 		}
   1595 		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
   1596 			int treg = r600_get_temp(ctx);
   1597 
   1598 			fetch_gs_input(ctx, src, treg);
   1599 			ctx->src[i].sel = treg;
   1600 			ctx->src[i].rel = 0;
   1601 		}
   1602 	}
   1603 	return 0;
   1604 }
   1605 
   1606 
   1607 /* Tessellation shaders pass outputs to the next shader using LDS.
   1608  *
   1609  * LS outputs = TCS(HS) inputs
   1610  * TCS(HS) outputs = TES(DS) inputs
   1611  *
   1612  * The LDS layout is:
   1613  * - TCS inputs for patch 0
   1614  * - TCS inputs for patch 1
   1615  * - TCS inputs for patch 2		= get_tcs_in_current_patch_offset (if RelPatchID==2)
   1616  * - ...
   1617  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
   1618  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
   1619  * - TCS outputs for patch 1
   1620  * - Per-patch TCS outputs for patch 1
   1621  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
   1622  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
   1623  * - ...
   1624  *
   1625  * All three shaders VS(LS), TCS, TES share the same LDS space.
   1626  */
   1627 /* this will return with the dw address in temp_reg.x */
   1628 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
   1629 				 const struct tgsi_full_dst_register *dst,
   1630 				 const struct tgsi_full_src_register *src,
   1631 				 int stride_bytes_reg, int stride_bytes_chan)
   1632 {
   1633 	struct tgsi_full_dst_register reg;
   1634 	ubyte *name, *index, *array_first;
   1635 	int r;
   1636 	int param;
   1637 	struct tgsi_shader_info *info = &ctx->info;
   1638 	/* Set the register description. The address computation is the same
   1639 	 * for sources and destinations. */
   1640 	if (src) {
   1641 		reg.Register.File = src->Register.File;
   1642 		reg.Register.Index = src->Register.Index;
   1643 		reg.Register.Indirect = src->Register.Indirect;
   1644 		reg.Register.Dimension = src->Register.Dimension;
   1645 		reg.Indirect = src->Indirect;
   1646 		reg.Dimension = src->Dimension;
   1647 		reg.DimIndirect = src->DimIndirect;
   1648 	} else
   1649 		reg = *dst;
   1650 
   1651 	/* If the register is 2-dimensional (e.g. an array of vertices
   1652 	 * in a primitive), calculate the base address of the vertex. */
   1653 	if (reg.Register.Dimension) {
   1654 		int sel, chan;
   1655 		if (reg.Dimension.Indirect) {
   1656 			unsigned addr_reg;
   1657 			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
   1658 
   1659 			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
   1660 			/* pull the value from index_reg */
   1661 			sel = addr_reg;
   1662 			chan = 0;
   1663 		} else {
   1664 			sel = V_SQ_ALU_SRC_LITERAL;
   1665 			chan = reg.Dimension.Index;
   1666 		}
   1667 
   1668 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
   1669 				   temp_reg, 0,
   1670 				   stride_bytes_reg, stride_bytes_chan,
   1671 				   sel, chan,
   1672 				   temp_reg, 0);
   1673 		if (r)
   1674 			return r;
   1675 	}
   1676 
   1677 	if (reg.Register.File == TGSI_FILE_INPUT) {
   1678 		name = info->input_semantic_name;
   1679 		index = info->input_semantic_index;
   1680 		array_first = info->input_array_first;
   1681 	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
   1682 		name = info->output_semantic_name;
   1683 		index = info->output_semantic_index;
   1684 		array_first = info->output_array_first;
   1685 	} else {
   1686 		assert(0);
   1687 		return -1;
   1688 	}
   1689 	if (reg.Register.Indirect) {
   1690 		int addr_reg;
   1691 		int first;
   1692 		/* Add the relative address of the element. */
   1693 		if (reg.Indirect.ArrayID)
   1694 			first = array_first[reg.Indirect.ArrayID];
   1695 		else
   1696 			first = reg.Register.Index;
   1697 
   1698 		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
   1699 
   1700 		/* pull the value from index_reg */
   1701 		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
   1702 				   temp_reg, 0,
   1703 				   V_SQ_ALU_SRC_LITERAL, 16,
   1704 				   addr_reg, 0,
   1705 				   temp_reg, 0);
   1706 		if (r)
   1707 			return r;
   1708 
   1709 		param = r600_get_lds_unique_index(name[first],
   1710 						  index[first]);
   1711 
   1712 	} else {
   1713 		param = r600_get_lds_unique_index(name[reg.Register.Index],
   1714 						  index[reg.Register.Index]);
   1715 	}
   1716 
   1717 	/* add to base_addr - passed in temp_reg.x */
   1718 	if (param) {
   1719 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   1720 				   temp_reg, 0,
   1721 				   temp_reg, 0,
   1722 				   V_SQ_ALU_SRC_LITERAL, param * 16);
   1723 		if (r)
   1724 			return r;
   1725 
   1726 	}
   1727 	return 0;
   1728 }
   1729 
   1730 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
   1731 			       unsigned dst_reg)
   1732 {
   1733 	struct r600_bytecode_alu alu;
   1734 	int r, i;
   1735 
   1736 	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
   1737 		ctx->bc->force_add_cf = 1;
   1738 	for (i = 1; i < 4; i++) {
   1739 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   1740 				   temp_reg, i,
   1741 				   temp_reg, 0,
   1742 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
   1743 		if (r)
   1744 			return r;
   1745 	}
   1746 	for (i = 0; i < 4; i++) {
   1747 		/* emit an LDS_READ_RET */
   1748 		memset(&alu, 0, sizeof(alu));
   1749 		alu.op = LDS_OP1_LDS_READ_RET;
   1750 		alu.src[0].sel = temp_reg;
   1751 		alu.src[0].chan = i;
   1752 		alu.src[1].sel = V_SQ_ALU_SRC_0;
   1753 		alu.src[2].sel = V_SQ_ALU_SRC_0;
   1754 		alu.dst.chan = 0;
   1755 		alu.is_lds_idx_op = true;
   1756 		alu.last = 1;
   1757 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   1758 		if (r)
   1759 			return r;
   1760 	}
   1761 	for (i = 0; i < 4; i++) {
   1762 		/* then read from LDS_OQ_A_POP */
   1763 		memset(&alu, 0, sizeof(alu));
   1764 
   1765 		alu.op = ALU_OP1_MOV;
   1766 		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
   1767 		alu.src[0].chan = 0;
   1768 		alu.dst.sel = dst_reg;
   1769 		alu.dst.chan = i;
   1770 		alu.dst.write = 1;
   1771 		alu.last = 1;
   1772 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   1773 		if (r)
   1774 			return r;
   1775 	}
   1776 	return 0;
   1777 }
   1778 
   1779 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
   1780 {
   1781 	int r;
   1782 	unsigned temp_reg = r600_get_temp(ctx);
   1783 
   1784 	r = get_lds_offset0(ctx, 2, temp_reg,
   1785 			    src->Register.Dimension ? false : true);
   1786 	if (r)
   1787 		return r;
   1788 
   1789 	/* the base address is now in temp.x */
   1790 	r = r600_get_byte_address(ctx, temp_reg,
   1791 				  NULL, src, ctx->tess_output_info, 1);
   1792 	if (r)
   1793 		return r;
   1794 
   1795 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
   1796 	if (r)
   1797 		return r;
   1798 	return 0;
   1799 }
   1800 
   1801 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
   1802 {
   1803 	int r;
   1804 	unsigned temp_reg = r600_get_temp(ctx);
   1805 
   1806 	/* t.x = ips * r0.y */
   1807 	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
   1808 			   temp_reg, 0,
   1809 			   ctx->tess_input_info, 0,
   1810 			   0, 1);
   1811 
   1812 	if (r)
   1813 		return r;
   1814 
   1815 	/* the base address is now in temp.x */
   1816 	r = r600_get_byte_address(ctx, temp_reg,
   1817 				  NULL, src, ctx->tess_input_info, 1);
   1818 	if (r)
   1819 		return r;
   1820 
   1821 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
   1822 	if (r)
   1823 		return r;
   1824 	return 0;
   1825 }
   1826 
   1827 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
   1828 {
   1829 	int r;
   1830 	unsigned temp_reg = r600_get_temp(ctx);
   1831 
   1832 	r = get_lds_offset0(ctx, 1, temp_reg,
   1833 			    src->Register.Dimension ? false : true);
   1834 	if (r)
   1835 		return r;
   1836 	/* the base address is now in temp.x */
   1837 	r = r600_get_byte_address(ctx, temp_reg,
   1838 				  NULL, src,
   1839 				  ctx->tess_output_info, 1);
   1840 	if (r)
   1841 		return r;
   1842 
   1843 	r = do_lds_fetch_values(ctx, temp_reg, dst_reg);
   1844 	if (r)
   1845 		return r;
   1846 	return 0;
   1847 }
   1848 
   1849 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
   1850 {
   1851 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1852 	unsigned i;
   1853 
   1854 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1855 		struct tgsi_full_src_register *src = &inst->Src[i];
   1856 
   1857 		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
   1858 			int treg = r600_get_temp(ctx);
   1859 			fetch_tes_input(ctx, src, treg);
   1860 			ctx->src[i].sel = treg;
   1861 			ctx->src[i].rel = 0;
   1862 		}
   1863 		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
   1864 			int treg = r600_get_temp(ctx);
   1865 			fetch_tcs_input(ctx, src, treg);
   1866 			ctx->src[i].sel = treg;
   1867 			ctx->src[i].rel = 0;
   1868 		}
   1869 		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
   1870 			int treg = r600_get_temp(ctx);
   1871 			fetch_tcs_output(ctx, src, treg);
   1872 			ctx->src[i].sel = treg;
   1873 			ctx->src[i].rel = 0;
   1874 		}
   1875 	}
   1876 	return 0;
   1877 }
   1878 
   1879 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
   1880 {
   1881 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1882 	struct r600_bytecode_alu alu;
   1883 	int i, j, k, nconst, r;
   1884 
   1885 	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1886 		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
   1887 			nconst++;
   1888 		}
   1889 		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
   1890 	}
   1891 	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
   1892 		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
   1893 			continue;
   1894 		}
   1895 
   1896 		if (ctx->src[i].rel) {
   1897 			int chan = inst->Src[i].Indirect.Swizzle;
   1898 			int treg = r600_get_temp(ctx);
   1899 			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
   1900 				return r;
   1901 
   1902 			ctx->src[i].kc_bank = 0;
   1903 			ctx->src[i].kc_rel = 0;
   1904 			ctx->src[i].sel = treg;
   1905 			ctx->src[i].rel = 0;
   1906 			j--;
   1907 		} else if (j > 0) {
   1908 			int treg = r600_get_temp(ctx);
   1909 			for (k = 0; k < 4; k++) {
   1910 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1911 				alu.op = ALU_OP1_MOV;
   1912 				alu.src[0].sel = ctx->src[i].sel;
   1913 				alu.src[0].chan = k;
   1914 				alu.src[0].rel = ctx->src[i].rel;
   1915 				alu.src[0].kc_bank = ctx->src[i].kc_bank;
   1916 				alu.src[0].kc_rel = ctx->src[i].kc_rel;
   1917 				alu.dst.sel = treg;
   1918 				alu.dst.chan = k;
   1919 				alu.dst.write = 1;
   1920 				if (k == 3)
   1921 					alu.last = 1;
   1922 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   1923 				if (r)
   1924 					return r;
   1925 			}
   1926 			ctx->src[i].sel = treg;
   1927 			ctx->src[i].rel =0;
   1928 			j--;
   1929 		}
   1930 	}
   1931 	return 0;
   1932 }
   1933 
   1934 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
   1935 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
   1936 {
   1937 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   1938 	struct r600_bytecode_alu alu;
   1939 	int i, j, k, nliteral, r;
   1940 
   1941 	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
   1942 		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
   1943 			nliteral++;
   1944 		}
   1945 	}
   1946 	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
   1947 		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
   1948 			int treg = r600_get_temp(ctx);
   1949 			for (k = 0; k < 4; k++) {
   1950 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   1951 				alu.op = ALU_OP1_MOV;
   1952 				alu.src[0].sel = ctx->src[i].sel;
   1953 				alu.src[0].chan = k;
   1954 				alu.src[0].value = ctx->src[i].value[k];
   1955 				alu.dst.sel = treg;
   1956 				alu.dst.chan = k;
   1957 				alu.dst.write = 1;
   1958 				if (k == 3)
   1959 					alu.last = 1;
   1960 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   1961 				if (r)
   1962 					return r;
   1963 			}
   1964 			ctx->src[i].sel = treg;
   1965 			j--;
   1966 		}
   1967 	}
   1968 	return 0;
   1969 }
   1970 
   1971 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
   1972 {
   1973 	int i, r, count = ctx->shader->ninput;
   1974 
   1975 	for (i = 0; i < count; i++) {
   1976 		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
   1977 			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
   1978 			if (r)
   1979 				return r;
   1980 		}
   1981 	}
   1982 	return 0;
   1983 }
   1984 
   1985 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
   1986 						  int stream, unsigned *stream_item_size)
   1987 {
   1988 	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
   1989 	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
   1990 	int i, j, r;
   1991 
   1992 	/* Sanity checking. */
   1993 	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
   1994 		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
   1995 		r = -EINVAL;
   1996 		goto out_err;
   1997 	}
   1998 	for (i = 0; i < so->num_outputs; i++) {
   1999 		if (so->output[i].output_buffer >= 4) {
   2000 			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
   2001 				 so->output[i].output_buffer);
   2002 			r = -EINVAL;
   2003 			goto out_err;
   2004 		}
   2005 	}
   2006 
   2007 	/* Initialize locations where the outputs are stored. */
   2008 	for (i = 0; i < so->num_outputs; i++) {
   2009 
   2010 		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
   2011 		start_comp[i] = so->output[i].start_component;
   2012 		/* Lower outputs with dst_offset < start_component.
   2013 		 *
   2014 		 * We can only output 4D vectors with a write mask, e.g. we can
   2015 		 * only output the W component at offset 3, etc. If we want
   2016 		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
   2017 		 * to move it to X and output X. */
   2018 		if (so->output[i].dst_offset < so->output[i].start_component) {
   2019 			unsigned tmp = r600_get_temp(ctx);
   2020 
   2021 			for (j = 0; j < so->output[i].num_components; j++) {
   2022 				struct r600_bytecode_alu alu;
   2023 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2024 				alu.op = ALU_OP1_MOV;
   2025 				alu.src[0].sel = so_gpr[i];
   2026 				alu.src[0].chan = so->output[i].start_component + j;
   2027 
   2028 				alu.dst.sel = tmp;
   2029 				alu.dst.chan = j;
   2030 				alu.dst.write = 1;
   2031 				if (j == so->output[i].num_components - 1)
   2032 					alu.last = 1;
   2033 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   2034 				if (r)
   2035 					return r;
   2036 			}
   2037 			start_comp[i] = 0;
   2038 			so_gpr[i] = tmp;
   2039 		}
   2040 	}
   2041 
   2042 	/* Write outputs to buffers. */
   2043 	for (i = 0; i < so->num_outputs; i++) {
   2044 		struct r600_bytecode_output output;
   2045 
   2046 		if (stream != -1 && stream != so->output[i].output_buffer)
   2047 			continue;
   2048 
   2049 		memset(&output, 0, sizeof(struct r600_bytecode_output));
   2050 		output.gpr = so_gpr[i];
   2051 		output.elem_size = so->output[i].num_components - 1;
   2052 		if (output.elem_size == 2)
   2053 			output.elem_size = 3; // 3 not supported, write 4 with junk at end
   2054 		output.array_base = so->output[i].dst_offset - start_comp[i];
   2055 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
   2056 		output.burst_count = 1;
   2057 		/* array_size is an upper limit for the burst_count
   2058 		 * with MEM_STREAM instructions */
   2059 		output.array_size = 0xFFF;
   2060 		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
   2061 
   2062 		if (ctx->bc->chip_class >= EVERGREEN) {
   2063 			switch (so->output[i].output_buffer) {
   2064 			case 0:
   2065 				output.op = CF_OP_MEM_STREAM0_BUF0;
   2066 				break;
   2067 			case 1:
   2068 				output.op = CF_OP_MEM_STREAM0_BUF1;
   2069 				break;
   2070 			case 2:
   2071 				output.op = CF_OP_MEM_STREAM0_BUF2;
   2072 				break;
   2073 			case 3:
   2074 				output.op = CF_OP_MEM_STREAM0_BUF3;
   2075 				break;
   2076 			}
   2077 			output.op += so->output[i].stream * 4;
   2078 			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
   2079 			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
   2080 		} else {
   2081 			switch (so->output[i].output_buffer) {
   2082 			case 0:
   2083 				output.op = CF_OP_MEM_STREAM0;
   2084 				break;
   2085 			case 1:
   2086 				output.op = CF_OP_MEM_STREAM1;
   2087 				break;
   2088 			case 2:
   2089 				output.op = CF_OP_MEM_STREAM2;
   2090 				break;
   2091 			case 3:
   2092 				output.op = CF_OP_MEM_STREAM3;
   2093 					break;
   2094 			}
   2095 			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
   2096 		}
   2097 		r = r600_bytecode_add_output(ctx->bc, &output);
   2098 		if (r)
   2099 			goto out_err;
   2100 	}
   2101 	return 0;
   2102 out_err:
   2103 	return r;
   2104 }
   2105 
   2106 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
   2107 {
   2108 	struct r600_bytecode_alu alu;
   2109 	unsigned reg;
   2110 
   2111 	if (!ctx->shader->vs_out_edgeflag)
   2112 		return;
   2113 
   2114 	reg = ctx->shader->output[ctx->edgeflag_output].gpr;
   2115 
   2116 	/* clamp(x, 0, 1) */
   2117 	memset(&alu, 0, sizeof(alu));
   2118 	alu.op = ALU_OP1_MOV;
   2119 	alu.src[0].sel = reg;
   2120 	alu.dst.sel = reg;
   2121 	alu.dst.write = 1;
   2122 	alu.dst.clamp = 1;
   2123 	alu.last = 1;
   2124 	r600_bytecode_add_alu(ctx->bc, &alu);
   2125 
   2126 	memset(&alu, 0, sizeof(alu));
   2127 	alu.op = ALU_OP1_FLT_TO_INT;
   2128 	alu.src[0].sel = reg;
   2129 	alu.dst.sel = reg;
   2130 	alu.dst.write = 1;
   2131 	alu.last = 1;
   2132 	r600_bytecode_add_alu(ctx->bc, &alu);
   2133 }
   2134 
   2135 static int generate_gs_copy_shader(struct r600_context *rctx,
   2136 				   struct r600_pipe_shader *gs,
   2137 				   struct pipe_stream_output_info *so)
   2138 {
   2139 	struct r600_shader_ctx ctx = {};
   2140 	struct r600_shader *gs_shader = &gs->shader;
   2141 	struct r600_pipe_shader *cshader;
   2142 	int ocnt = gs_shader->noutput;
   2143 	struct r600_bytecode_alu alu;
   2144 	struct r600_bytecode_vtx vtx;
   2145 	struct r600_bytecode_output output;
   2146 	struct r600_bytecode_cf *cf_jump, *cf_pop,
   2147 		*last_exp_pos = NULL, *last_exp_param = NULL;
   2148 	int i, j, next_clip_pos = 61, next_param = 0;
   2149 	int ring;
   2150 	bool only_ring_0 = true;
   2151 	cshader = calloc(1, sizeof(struct r600_pipe_shader));
   2152 	if (!cshader)
   2153 		return 0;
   2154 
   2155 	memcpy(cshader->shader.output, gs_shader->output, ocnt *
   2156 	       sizeof(struct r600_shader_io));
   2157 
   2158 	cshader->shader.noutput = ocnt;
   2159 
   2160 	ctx.shader = &cshader->shader;
   2161 	ctx.bc = &ctx.shader->bc;
   2162 	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
   2163 
   2164 	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
   2165 			   rctx->screen->has_compressed_msaa_texturing);
   2166 
   2167 	ctx.bc->isa = rctx->isa;
   2168 
   2169 	cf_jump = NULL;
   2170 	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
   2171 
   2172 	/* R0.x = R0.x & 0x3fffffff */
   2173 	memset(&alu, 0, sizeof(alu));
   2174 	alu.op = ALU_OP2_AND_INT;
   2175 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2176 	alu.src[1].value = 0x3fffffff;
   2177 	alu.dst.write = 1;
   2178 	r600_bytecode_add_alu(ctx.bc, &alu);
   2179 
   2180 	/* R0.y = R0.x >> 30 */
   2181 	memset(&alu, 0, sizeof(alu));
   2182 	alu.op = ALU_OP2_LSHR_INT;
   2183 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2184 	alu.src[1].value = 0x1e;
   2185 	alu.dst.chan = 1;
   2186 	alu.dst.write = 1;
   2187 	alu.last = 1;
   2188 	r600_bytecode_add_alu(ctx.bc, &alu);
   2189 
   2190 	/* fetch vertex data from GSVS ring */
   2191 	for (i = 0; i < ocnt; ++i) {
   2192 		struct r600_shader_io *out = &ctx.shader->output[i];
   2193 
   2194 		out->gpr = i + 1;
   2195 		out->ring_offset = i * 16;
   2196 
   2197 		memset(&vtx, 0, sizeof(vtx));
   2198 		vtx.op = FETCH_OP_VFETCH;
   2199 		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
   2200 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   2201 		vtx.mega_fetch_count = 16;
   2202 		vtx.offset = out->ring_offset;
   2203 		vtx.dst_gpr = out->gpr;
   2204 		vtx.src_gpr = 0;
   2205 		vtx.dst_sel_x = 0;
   2206 		vtx.dst_sel_y = 1;
   2207 		vtx.dst_sel_z = 2;
   2208 		vtx.dst_sel_w = 3;
   2209 		if (rctx->b.chip_class >= EVERGREEN) {
   2210 			vtx.use_const_fields = 1;
   2211 		} else {
   2212 			vtx.data_format = FMT_32_32_32_32_FLOAT;
   2213 		}
   2214 
   2215 		r600_bytecode_add_vtx(ctx.bc, &vtx);
   2216 	}
   2217 	ctx.temp_reg = i + 1;
   2218 	for (ring = 3; ring >= 0; --ring) {
   2219 		bool enabled = false;
   2220 		for (i = 0; i < so->num_outputs; i++) {
   2221 			if (so->output[i].stream == ring) {
   2222 				enabled = true;
   2223 				if (ring > 0)
   2224 					only_ring_0 = false;
   2225 				break;
   2226 			}
   2227 		}
   2228 		if (ring != 0 && !enabled) {
   2229 			cshader->shader.ring_item_sizes[ring] = 0;
   2230 			continue;
   2231 		}
   2232 
   2233 		if (cf_jump) {
   2234 			// Patch up jump label
   2235 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
   2236 			cf_pop = ctx.bc->cf_last;
   2237 
   2238 			cf_jump->cf_addr = cf_pop->id + 2;
   2239 			cf_jump->pop_count = 1;
   2240 			cf_pop->cf_addr = cf_pop->id + 2;
   2241 			cf_pop->pop_count = 1;
   2242 		}
   2243 
   2244 		/* PRED_SETE_INT __, R0.y, ring */
   2245 		memset(&alu, 0, sizeof(alu));
   2246 		alu.op = ALU_OP2_PRED_SETE_INT;
   2247 		alu.src[0].chan = 1;
   2248 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2249 		alu.src[1].value = ring;
   2250 		alu.execute_mask = 1;
   2251 		alu.update_pred = 1;
   2252 		alu.last = 1;
   2253 		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
   2254 
   2255 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
   2256 		cf_jump = ctx.bc->cf_last;
   2257 
   2258 		if (enabled)
   2259 			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
   2260 		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
   2261 	}
   2262 
   2263 	/* bc adds nops - copy it */
   2264 	if (ctx.bc->chip_class == R600) {
   2265 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2266 		alu.op = ALU_OP0_NOP;
   2267 		alu.last = 1;
   2268 		r600_bytecode_add_alu(ctx.bc, &alu);
   2269 
   2270 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
   2271 	}
   2272 
   2273 	/* export vertex data */
   2274 	/* XXX factor out common code with r600_shader_from_tgsi ? */
   2275 	for (i = 0; i < ocnt; ++i) {
   2276 		struct r600_shader_io *out = &ctx.shader->output[i];
   2277 		bool instream0 = true;
   2278 		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
   2279 			continue;
   2280 
   2281 		for (j = 0; j < so->num_outputs; j++) {
   2282 			if (so->output[j].register_index == i) {
   2283 				if (so->output[j].stream == 0)
   2284 					break;
   2285 				if (so->output[j].stream > 0)
   2286 					instream0 = false;
   2287 			}
   2288 		}
   2289 		if (!instream0)
   2290 			continue;
   2291 		memset(&output, 0, sizeof(output));
   2292 		output.gpr = out->gpr;
   2293 		output.elem_size = 3;
   2294 		output.swizzle_x = 0;
   2295 		output.swizzle_y = 1;
   2296 		output.swizzle_z = 2;
   2297 		output.swizzle_w = 3;
   2298 		output.burst_count = 1;
   2299 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   2300 		output.op = CF_OP_EXPORT;
   2301 		switch (out->name) {
   2302 		case TGSI_SEMANTIC_POSITION:
   2303 			output.array_base = 60;
   2304 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   2305 			break;
   2306 
   2307 		case TGSI_SEMANTIC_PSIZE:
   2308 			output.array_base = 61;
   2309 			if (next_clip_pos == 61)
   2310 				next_clip_pos = 62;
   2311 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   2312 			output.swizzle_y = 7;
   2313 			output.swizzle_z = 7;
   2314 			output.swizzle_w = 7;
   2315 			ctx.shader->vs_out_misc_write = 1;
   2316 			ctx.shader->vs_out_point_size = 1;
   2317 			break;
   2318 		case TGSI_SEMANTIC_LAYER:
   2319 			if (out->spi_sid) {
   2320 				/* duplicate it as PARAM to pass to the pixel shader */
   2321 				output.array_base = next_param++;
   2322 				r600_bytecode_add_output(ctx.bc, &output);
   2323 				last_exp_param = ctx.bc->cf_last;
   2324 			}
   2325 			output.array_base = 61;
   2326 			if (next_clip_pos == 61)
   2327 				next_clip_pos = 62;
   2328 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   2329 			output.swizzle_x = 7;
   2330 			output.swizzle_y = 7;
   2331 			output.swizzle_z = 0;
   2332 			output.swizzle_w = 7;
   2333 			ctx.shader->vs_out_misc_write = 1;
   2334 			ctx.shader->vs_out_layer = 1;
   2335 			break;
   2336 		case TGSI_SEMANTIC_VIEWPORT_INDEX:
   2337 			if (out->spi_sid) {
   2338 				/* duplicate it as PARAM to pass to the pixel shader */
   2339 				output.array_base = next_param++;
   2340 				r600_bytecode_add_output(ctx.bc, &output);
   2341 				last_exp_param = ctx.bc->cf_last;
   2342 			}
   2343 			output.array_base = 61;
   2344 			if (next_clip_pos == 61)
   2345 				next_clip_pos = 62;
   2346 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   2347 			ctx.shader->vs_out_misc_write = 1;
   2348 			ctx.shader->vs_out_viewport = 1;
   2349 			output.swizzle_x = 7;
   2350 			output.swizzle_y = 7;
   2351 			output.swizzle_z = 7;
   2352 			output.swizzle_w = 0;
   2353 			break;
   2354 		case TGSI_SEMANTIC_CLIPDIST:
   2355 			/* spi_sid is 0 for clipdistance outputs that were generated
   2356 			 * for clipvertex - we don't need to pass them to PS */
   2357 			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
   2358 			if (out->spi_sid) {
   2359 				/* duplicate it as PARAM to pass to the pixel shader */
   2360 				output.array_base = next_param++;
   2361 				r600_bytecode_add_output(ctx.bc, &output);
   2362 				last_exp_param = ctx.bc->cf_last;
   2363 			}
   2364 			output.array_base = next_clip_pos++;
   2365 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   2366 			break;
   2367 		case TGSI_SEMANTIC_FOG:
   2368 			output.swizzle_y = 4; /* 0 */
   2369 			output.swizzle_z = 4; /* 0 */
   2370 			output.swizzle_w = 5; /* 1 */
   2371 			break;
   2372 		default:
   2373 			output.array_base = next_param++;
   2374 			break;
   2375 		}
   2376 		r600_bytecode_add_output(ctx.bc, &output);
   2377 		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
   2378 			last_exp_param = ctx.bc->cf_last;
   2379 		else
   2380 			last_exp_pos = ctx.bc->cf_last;
   2381 	}
   2382 
   2383 	if (!last_exp_pos) {
   2384 		memset(&output, 0, sizeof(output));
   2385 		output.gpr = 0;
   2386 		output.elem_size = 3;
   2387 		output.swizzle_x = 7;
   2388 		output.swizzle_y = 7;
   2389 		output.swizzle_z = 7;
   2390 		output.swizzle_w = 7;
   2391 		output.burst_count = 1;
   2392 		output.type = 2;
   2393 		output.op = CF_OP_EXPORT;
   2394 		output.array_base = 60;
   2395 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   2396 		r600_bytecode_add_output(ctx.bc, &output);
   2397 		last_exp_pos = ctx.bc->cf_last;
   2398 	}
   2399 
   2400 	if (!last_exp_param) {
   2401 		memset(&output, 0, sizeof(output));
   2402 		output.gpr = 0;
   2403 		output.elem_size = 3;
   2404 		output.swizzle_x = 7;
   2405 		output.swizzle_y = 7;
   2406 		output.swizzle_z = 7;
   2407 		output.swizzle_w = 7;
   2408 		output.burst_count = 1;
   2409 		output.type = 2;
   2410 		output.op = CF_OP_EXPORT;
   2411 		output.array_base = next_param++;
   2412 		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   2413 		r600_bytecode_add_output(ctx.bc, &output);
   2414 		last_exp_param = ctx.bc->cf_last;
   2415 	}
   2416 
   2417 	last_exp_pos->op = CF_OP_EXPORT_DONE;
   2418 	last_exp_param->op = CF_OP_EXPORT_DONE;
   2419 
   2420 	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
   2421 	cf_pop = ctx.bc->cf_last;
   2422 
   2423 	cf_jump->cf_addr = cf_pop->id + 2;
   2424 	cf_jump->pop_count = 1;
   2425 	cf_pop->cf_addr = cf_pop->id + 2;
   2426 	cf_pop->pop_count = 1;
   2427 
   2428 	if (ctx.bc->chip_class == CAYMAN)
   2429 		cm_bytecode_add_cf_end(ctx.bc);
   2430 	else {
   2431 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
   2432 		ctx.bc->cf_last->end_of_program = 1;
   2433 	}
   2434 
   2435 	gs->gs_copy_shader = cshader;
   2436 	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
   2437 
   2438 	ctx.bc->nstack = 1;
   2439 
   2440 	return r600_bytecode_build(ctx.bc);
   2441 }
   2442 
   2443 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
   2444 {
   2445 	if (ind) {
   2446 		struct r600_bytecode_alu alu;
   2447 		int r;
   2448 
   2449 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2450 		alu.op = ALU_OP2_ADD_INT;
   2451 		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
   2452 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2453 		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
   2454 		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
   2455 		alu.dst.write = 1;
   2456 		alu.last = 1;
   2457 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2458 		if (r)
   2459 			return r;
   2460 	}
   2461 	return 0;
   2462 }
   2463 
   2464 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
   2465 {
   2466 	struct r600_bytecode_output output;
   2467 	int i, k, ring_offset;
   2468 	int effective_stream = stream == -1 ? 0 : stream;
   2469 	int idx = 0;
   2470 
   2471 	for (i = 0; i < ctx->shader->noutput; i++) {
   2472 		if (ctx->gs_for_vs) {
   2473 			/* for ES we need to lookup corresponding ring offset expected by GS
   2474 			 * (map this output to GS input by name and sid) */
   2475 			/* FIXME precompute offsets */
   2476 			ring_offset = -1;
   2477 			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
   2478 				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
   2479 				struct r600_shader_io *out = &ctx->shader->output[i];
   2480 				if (in->name == out->name && in->sid == out->sid)
   2481 					ring_offset = in->ring_offset;
   2482 			}
   2483 
   2484 			if (ring_offset == -1)
   2485 				continue;
   2486 		} else {
   2487 			ring_offset = idx * 16;
   2488 			idx++;
   2489 		}
   2490 
   2491 		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
   2492 			continue;
   2493 		/* next_ring_offset after parsing input decls contains total size of
   2494 		 * single vertex data, gs_next_vertex - current vertex index */
   2495 		if (!ind)
   2496 			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
   2497 
   2498 		memset(&output, 0, sizeof(struct r600_bytecode_output));
   2499 		output.gpr = ctx->shader->output[i].gpr;
   2500 		output.elem_size = 3;
   2501 		output.comp_mask = 0xF;
   2502 		output.burst_count = 1;
   2503 
   2504 		if (ind)
   2505 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
   2506 		else
   2507 			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
   2508 
   2509 		switch (stream) {
   2510 		default:
   2511 		case 0:
   2512 			output.op = CF_OP_MEM_RING; break;
   2513 		case 1:
   2514 			output.op = CF_OP_MEM_RING1; break;
   2515 		case 2:
   2516 			output.op = CF_OP_MEM_RING2; break;
   2517 		case 3:
   2518 			output.op = CF_OP_MEM_RING3; break;
   2519 		}
   2520 
   2521 		if (ind) {
   2522 			output.array_base = ring_offset >> 2; /* in dwords */
   2523 			output.array_size = 0xfff;
   2524 			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
   2525 		} else
   2526 			output.array_base = ring_offset >> 2; /* in dwords */
   2527 		r600_bytecode_add_output(ctx->bc, &output);
   2528 	}
   2529 
   2530 	++ctx->gs_next_vertex;
   2531 	return 0;
   2532 }
   2533 
   2534 
   2535 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
   2536 {
   2537 	int r;
   2538 	struct r600_bytecode_vtx vtx;
   2539 	int temp_val = ctx->temp_reg;
   2540 	/* need to store the TCS output somewhere */
   2541 	r = single_alu_op2(ctx, ALU_OP1_MOV,
   2542 			   temp_val, 0,
   2543 			   V_SQ_ALU_SRC_LITERAL, 0,
   2544 			   0, 0);
   2545 	if (r)
   2546 		return r;
   2547 
   2548 	/* used by VS/TCS */
   2549 	if (ctx->tess_input_info) {
   2550 		/* fetch tcs input values into resv space */
   2551 		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
   2552 		vtx.op = FETCH_OP_VFETCH;
   2553 		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
   2554 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   2555 		vtx.mega_fetch_count = 16;
   2556 		vtx.data_format = FMT_32_32_32_32;
   2557 		vtx.num_format_all = 2;
   2558 		vtx.format_comp_all = 1;
   2559 		vtx.use_const_fields = 0;
   2560 		vtx.endian = r600_endian_swap(32);
   2561 		vtx.srf_mode_all = 1;
   2562 		vtx.offset = 0;
   2563 		vtx.dst_gpr = ctx->tess_input_info;
   2564 		vtx.dst_sel_x = 0;
   2565 		vtx.dst_sel_y = 1;
   2566 		vtx.dst_sel_z = 2;
   2567 		vtx.dst_sel_w = 3;
   2568 		vtx.src_gpr = temp_val;
   2569 		vtx.src_sel_x = 0;
   2570 
   2571 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
   2572 		if (r)
   2573 			return r;
   2574 	}
   2575 
   2576 	/* used by TCS/TES */
   2577 	if (ctx->tess_output_info) {
   2578 		/* fetch tcs output values into resv space */
   2579 		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
   2580 		vtx.op = FETCH_OP_VFETCH;
   2581 		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
   2582 		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   2583 		vtx.mega_fetch_count = 16;
   2584 		vtx.data_format = FMT_32_32_32_32;
   2585 		vtx.num_format_all = 2;
   2586 		vtx.format_comp_all = 1;
   2587 		vtx.use_const_fields = 0;
   2588 		vtx.endian = r600_endian_swap(32);
   2589 		vtx.srf_mode_all = 1;
   2590 		vtx.offset = 16;
   2591 		vtx.dst_gpr = ctx->tess_output_info;
   2592 		vtx.dst_sel_x = 0;
   2593 		vtx.dst_sel_y = 1;
   2594 		vtx.dst_sel_z = 2;
   2595 		vtx.dst_sel_w = 3;
   2596 		vtx.src_gpr = temp_val;
   2597 		vtx.src_sel_x = 0;
   2598 
   2599 		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
   2600 		if (r)
   2601 			return r;
   2602 	}
   2603 	return 0;
   2604 }
   2605 
   2606 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
   2607 {
   2608 	int i, j, r;
   2609 	int temp_reg;
   2610 
   2611 	/* fetch tcs input values into input_vals */
   2612 	ctx->tess_input_info = r600_get_temp(ctx);
   2613 	ctx->tess_output_info = 0;
   2614 	r = r600_fetch_tess_io_info(ctx);
   2615 	if (r)
   2616 		return r;
   2617 
   2618 	temp_reg = r600_get_temp(ctx);
   2619 	/* dst reg contains LDS address stride * idx */
   2620 	/* MUL vertexID, vertex_dw_stride */
   2621 	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
   2622 			   temp_reg, 0,
   2623 			   ctx->tess_input_info, 1,
   2624 			   0, 1); /* rel id in r0.y? */
   2625 	if (r)
   2626 		return r;
   2627 
   2628 	for (i = 0; i < ctx->shader->noutput; i++) {
   2629 		struct r600_bytecode_alu alu;
   2630 		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);
   2631 
   2632 		if (param) {
   2633 			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   2634 					   temp_reg, 1,
   2635 					   temp_reg, 0,
   2636 					   V_SQ_ALU_SRC_LITERAL, param * 16);
   2637 			if (r)
   2638 				return r;
   2639 		}
   2640 
   2641 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   2642 				   temp_reg, 2,
   2643 				   temp_reg, param ? 1 : 0,
   2644 				   V_SQ_ALU_SRC_LITERAL, 8);
   2645 		if (r)
   2646 			return r;
   2647 
   2648 
   2649 		for (j = 0; j < 2; j++) {
   2650 			int chan = (j == 1) ? 2 : (param ? 1 : 0);
   2651 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2652 			alu.op = LDS_OP3_LDS_WRITE_REL;
   2653 			alu.src[0].sel = temp_reg;
   2654 			alu.src[0].chan = chan;
   2655 			alu.src[1].sel = ctx->shader->output[i].gpr;
   2656 			alu.src[1].chan = j * 2;
   2657 			alu.src[2].sel = ctx->shader->output[i].gpr;
   2658 			alu.src[2].chan = (j * 2) + 1;
   2659 			alu.last = 1;
   2660 			alu.dst.chan = 0;
   2661 			alu.lds_idx = 1;
   2662 			alu.is_lds_idx_op = true;
   2663 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   2664 			if (r)
   2665 				return r;
   2666 		}
   2667 	}
   2668 	return 0;
   2669 }
   2670 
   2671 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
   2672 {
   2673 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   2674 	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
   2675 	int i, r, lasti;
   2676 	int temp_reg = r600_get_temp(ctx);
   2677 	struct r600_bytecode_alu alu;
   2678 	unsigned write_mask = dst->Register.WriteMask;
   2679 
   2680 	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
   2681 		return 0;
   2682 
   2683 	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
   2684 	if (r)
   2685 		return r;
   2686 
   2687 	/* the base address is now in temp.x */
   2688 	r = r600_get_byte_address(ctx, temp_reg,
   2689 				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
   2690 	if (r)
   2691 		return r;
   2692 
   2693 	/* LDS write */
   2694 	lasti = tgsi_last_instruction(write_mask);
   2695 	for (i = 1; i <= lasti; i++) {
   2696 
   2697 		if (!(write_mask & (1 << i)))
   2698 			continue;
   2699 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   2700 				   temp_reg, i,
   2701 				   temp_reg, 0,
   2702 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
   2703 		if (r)
   2704 			return r;
   2705 	}
   2706 
   2707 	for (i = 0; i <= lasti; i++) {
   2708 		if (!(write_mask & (1 << i)))
   2709 			continue;
   2710 
   2711 		if ((i == 0 && ((write_mask & 3) == 3)) ||
   2712 		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
   2713 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2714 			alu.op = LDS_OP3_LDS_WRITE_REL;
   2715 			alu.src[0].sel = temp_reg;
   2716 			alu.src[0].chan = i;
   2717 
   2718 			alu.src[1].sel = dst->Register.Index;
   2719 			alu.src[1].sel += ctx->file_offset[dst->Register.File];
   2720 			alu.src[1].chan = i;
   2721 
   2722 			alu.src[2].sel = dst->Register.Index;
   2723 			alu.src[2].sel += ctx->file_offset[dst->Register.File];
   2724 			alu.src[2].chan = i + 1;
   2725 			alu.lds_idx = 1;
   2726 			alu.dst.chan = 0;
   2727 			alu.last = 1;
   2728 			alu.is_lds_idx_op = true;
   2729 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   2730 			if (r)
   2731 				return r;
   2732 			i += 1;
   2733 			continue;
   2734 		}
   2735 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   2736 		alu.op = LDS_OP2_LDS_WRITE;
   2737 		alu.src[0].sel = temp_reg;
   2738 		alu.src[0].chan = i;
   2739 
   2740 		alu.src[1].sel = dst->Register.Index;
   2741 		alu.src[1].sel += ctx->file_offset[dst->Register.File];
   2742 		alu.src[1].chan = i;
   2743 
   2744 		alu.src[2].sel = V_SQ_ALU_SRC_0;
   2745 		alu.dst.chan = 0;
   2746 		alu.last = 1;
   2747 		alu.is_lds_idx_op = true;
   2748 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   2749 		if (r)
   2750 			return r;
   2751 	}
   2752 	return 0;
   2753 }
   2754 
   2755 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
   2756 				 int output_idx)
   2757 {
   2758 	int param;
   2759 	unsigned temp_reg = r600_get_temp(ctx);
   2760 	unsigned name = ctx->shader->output[output_idx].name;
   2761 	int dreg = ctx->shader->output[output_idx].gpr;
   2762 	int r;
   2763 
   2764 	param = r600_get_lds_unique_index(name, 0);
   2765 	r = get_lds_offset0(ctx, 1, temp_reg, true);
   2766 	if (r)
   2767 		return r;
   2768 
   2769 	r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   2770 			   temp_reg, 0,
   2771 			   temp_reg, 0,
   2772 			   V_SQ_ALU_SRC_LITERAL, param * 16);
   2773 	if (r)
   2774 		return r;
   2775 
   2776 	do_lds_fetch_values(ctx, temp_reg, dreg);
   2777 	return 0;
   2778 }
   2779 
   2780 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
   2781 {
   2782 	unsigned i;
   2783 	int stride, outer_comps, inner_comps;
   2784 	int tessinner_idx = -1, tessouter_idx = -1;
   2785 	int r;
   2786 	int temp_reg = r600_get_temp(ctx);
   2787 	int treg[3] = {-1, -1, -1};
   2788 	struct r600_bytecode_alu alu;
   2789 	struct r600_bytecode_cf *cf_jump, *cf_pop;
   2790 
   2791 	/* only execute factor emission for invocation 0 */
   2792 	/* PRED_SETE_INT __, R0.x, 0 */
   2793 	memset(&alu, 0, sizeof(alu));
   2794 	alu.op = ALU_OP2_PRED_SETE_INT;
   2795 	alu.src[0].chan = 2;
   2796 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   2797 	alu.execute_mask = 1;
   2798 	alu.update_pred = 1;
   2799 	alu.last = 1;
   2800 	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
   2801 
   2802 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
   2803 	cf_jump = ctx->bc->cf_last;
   2804 
   2805 	treg[0] = r600_get_temp(ctx);
   2806 	switch (ctx->shader->tcs_prim_mode) {
   2807 	case PIPE_PRIM_LINES:
   2808 		stride = 8; /* 2 dwords, 1 vec2 store */
   2809 		outer_comps = 2;
   2810 		inner_comps = 0;
   2811 		break;
   2812 	case PIPE_PRIM_TRIANGLES:
   2813 		stride = 16; /* 4 dwords, 1 vec4 store */
   2814 		outer_comps = 3;
   2815 		inner_comps = 1;
   2816 		treg[1] = r600_get_temp(ctx);
   2817 		break;
   2818 	case PIPE_PRIM_QUADS:
   2819 		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
   2820 		outer_comps = 4;
   2821 		inner_comps = 2;
   2822 		treg[1] = r600_get_temp(ctx);
   2823 		treg[2] = r600_get_temp(ctx);
   2824 		break;
   2825 	default:
   2826 		assert(0);
   2827 		return -1;
   2828 	}
   2829 
   2830 	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
   2831 	/* TF_WRITE takes index in R.x, value in R.y */
   2832 	for (i = 0; i < ctx->shader->noutput; i++) {
   2833 		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSINNER)
   2834 			tessinner_idx = i;
   2835 		if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSOUTER)
   2836 			tessouter_idx = i;
   2837 	}
   2838 
   2839 	if (tessouter_idx == -1)
   2840 		return -1;
   2841 
   2842 	if (tessinner_idx == -1 && inner_comps)
   2843 		return -1;
   2844 
   2845 	if (tessouter_idx != -1) {
   2846 		r = r600_tess_factor_read(ctx, tessouter_idx);
   2847 		if (r)
   2848 			return r;
   2849 	}
   2850 
   2851 	if (tessinner_idx != -1) {
   2852 		r = r600_tess_factor_read(ctx, tessinner_idx);
   2853 		if (r)
   2854 			return r;
   2855 	}
   2856 
   2857 	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
   2858 	/* r.x = relpatchid(r0.y) * tf_stride */
   2859 
   2860 	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
   2861 	/* add incoming r0.w to it: t.x = t.x + r0.w */
   2862 	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
   2863 			   temp_reg, 0,
   2864 			   0, 1,
   2865 			   V_SQ_ALU_SRC_LITERAL, stride,
   2866 			   0, 3);
   2867 	if (r)
   2868 		return r;
   2869 
   2870 	for (i = 0; i < outer_comps + inner_comps; i++) {
   2871 		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
   2872 		int out_comp = i >= outer_comps ? i - outer_comps : i;
   2873 
   2874 		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
   2875 				   treg[i / 2], (2 * (i % 2)),
   2876 				   temp_reg, 0,
   2877 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
   2878 		if (r)
   2879 			return r;
   2880 		r = single_alu_op2(ctx, ALU_OP1_MOV,
   2881 				   treg[i / 2], 1 + (2 * (i%2)),
   2882 				   ctx->shader->output[out_idx].gpr, out_comp,
   2883 				   0, 0);
   2884 		if (r)
   2885 			return r;
   2886 	}
   2887 	for (i = 0; i < outer_comps + inner_comps; i++) {
   2888 		struct r600_bytecode_gds gds;
   2889 
   2890 		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
   2891 		gds.src_gpr = treg[i / 2];
   2892 		gds.src_sel_x = 2 * (i % 2);
   2893 		gds.src_sel_y = 1 + (2 * (i % 2));
   2894 		gds.src_sel_z = 4;
   2895 		gds.dst_sel_x = 7;
   2896 		gds.dst_sel_y = 7;
   2897 		gds.dst_sel_z = 7;
   2898 		gds.dst_sel_w = 7;
   2899 		gds.op = FETCH_OP_TF_WRITE;
   2900 		r = r600_bytecode_add_gds(ctx->bc, &gds);
   2901 		if (r)
   2902 			return r;
   2903 	}
   2904 
   2905 	// Patch up jump label
   2906 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
   2907 	cf_pop = ctx->bc->cf_last;
   2908 
   2909 	cf_jump->cf_addr = cf_pop->id + 2;
   2910 	cf_jump->pop_count = 1;
   2911 	cf_pop->cf_addr = cf_pop->id + 2;
   2912 	cf_pop->pop_count = 1;
   2913 
   2914 	return 0;
   2915 }
   2916 
   2917 static int r600_shader_from_tgsi(struct r600_context *rctx,
   2918 				 struct r600_pipe_shader *pipeshader,
   2919 				 union r600_shader_key key)
   2920 {
   2921 	struct r600_screen *rscreen = rctx->screen;
   2922 	struct r600_shader *shader = &pipeshader->shader;
   2923 	struct tgsi_token *tokens = pipeshader->selector->tokens;
   2924 	struct pipe_stream_output_info so = pipeshader->selector->so;
   2925 	struct tgsi_full_immediate *immediate;
   2926 	struct r600_shader_ctx ctx;
   2927 	struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
   2928 	unsigned output_done, noutput;
   2929 	unsigned opcode;
   2930 	int i, j, k, r = 0;
   2931 	int next_param_base = 0, next_clip_base;
   2932 	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
   2933 	bool indirect_gprs;
   2934 	bool ring_outputs = false;
   2935 	bool lds_outputs = false;
   2936 	bool lds_inputs = false;
   2937 	bool pos_emitted = false;
   2938 
   2939 	ctx.bc = &shader->bc;
   2940 	ctx.shader = shader;
   2941 	ctx.native_integers = true;
   2942 
   2943 	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
   2944 			   rscreen->has_compressed_msaa_texturing);
   2945 	ctx.tokens = tokens;
   2946 	tgsi_scan_shader(tokens, &ctx.info);
   2947 	shader->indirect_files = ctx.info.indirect_files;
   2948 
   2949 	shader->uses_doubles = ctx.info.uses_doubles;
   2950 
   2951 	indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
   2952 	tgsi_parse_init(&ctx.parse, tokens);
   2953 	ctx.type = ctx.info.processor;
   2954 	shader->processor_type = ctx.type;
   2955 	ctx.bc->type = shader->processor_type;
   2956 
   2957 	switch (ctx.type) {
   2958 	case PIPE_SHADER_VERTEX:
   2959 		shader->vs_as_gs_a = key.vs.as_gs_a;
   2960 		shader->vs_as_es = key.vs.as_es;
   2961 		shader->vs_as_ls = key.vs.as_ls;
   2962 		if (shader->vs_as_es)
   2963 			ring_outputs = true;
   2964 		if (shader->vs_as_ls)
   2965 			lds_outputs = true;
   2966 		break;
   2967 	case PIPE_SHADER_GEOMETRY:
   2968 		ring_outputs = true;
   2969 		break;
   2970 	case PIPE_SHADER_TESS_CTRL:
   2971 		shader->tcs_prim_mode = key.tcs.prim_mode;
   2972 		lds_outputs = true;
   2973 		lds_inputs = true;
   2974 		break;
   2975 	case PIPE_SHADER_TESS_EVAL:
   2976 		shader->tes_as_es = key.tes.as_es;
   2977 		lds_inputs = true;
   2978 		if (shader->tes_as_es)
   2979 			ring_outputs = true;
   2980 		break;
   2981 	case PIPE_SHADER_FRAGMENT:
   2982 		shader->two_side = key.ps.color_two_side;
   2983 		break;
   2984 	default:
   2985 		break;
   2986 	}
   2987 
   2988 	if (shader->vs_as_es || shader->tes_as_es) {
   2989 		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
   2990 	} else {
   2991 		ctx.gs_for_vs = NULL;
   2992 	}
   2993 
   2994 	ctx.next_ring_offset = 0;
   2995 	ctx.gs_out_ring_offset = 0;
   2996 	ctx.gs_next_vertex = 0;
   2997 	ctx.gs_stream_output_info = &so;
   2998 
   2999 	ctx.face_gpr = -1;
   3000 	ctx.fixed_pt_position_gpr = -1;
   3001 	ctx.fragcoord_input = -1;
   3002 	ctx.colors_used = 0;
   3003 	ctx.clip_vertex_write = 0;
   3004 
   3005 	shader->nr_ps_color_exports = 0;
   3006 	shader->nr_ps_max_color_exports = 0;
   3007 
   3008 
   3009 	/* register allocations */
   3010 	/* Values [0,127] correspond to GPR[0..127].
   3011 	 * Values [128,159] correspond to constant buffer bank 0
   3012 	 * Values [160,191] correspond to constant buffer bank 1
   3013 	 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
   3014 	 * Values [256,287] correspond to constant buffer bank 2 (EG)
   3015 	 * Values [288,319] correspond to constant buffer bank 3 (EG)
   3016 	 * Other special values are shown in the list below.
   3017 	 * 244  ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
   3018 	 * 245  ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
   3019 	 * 246  ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
   3020 	 * 247  ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
   3021 	 * 248	SQ_ALU_SRC_0: special constant 0.0.
   3022 	 * 249	SQ_ALU_SRC_1: special constant 1.0 float.
   3023 	 * 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
   3024 	 * 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
   3025 	 * 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
   3026 	 * 253	SQ_ALU_SRC_LITERAL: literal constant.
   3027 	 * 254	SQ_ALU_SRC_PV: previous vector result.
   3028 	 * 255	SQ_ALU_SRC_PS: previous scalar result.
   3029 	 */
   3030 	for (i = 0; i < TGSI_FILE_COUNT; i++) {
   3031 		ctx.file_offset[i] = 0;
   3032 	}
   3033 
   3034 	if (ctx.type == PIPE_SHADER_VERTEX) {
   3035 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
   3036 		r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
   3037 	}
   3038 	if (ctx.type == PIPE_SHADER_FRAGMENT) {
   3039 		if (ctx.bc->chip_class >= EVERGREEN)
   3040 			ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
   3041 		else
   3042 			ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
   3043 	}
   3044 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
   3045 		/* FIXME 1 would be enough in some cases (3 or less input vertices) */
   3046 		ctx.file_offset[TGSI_FILE_INPUT] = 2;
   3047 	}
   3048 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
   3049 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
   3050 	if (ctx.type == PIPE_SHADER_TESS_EVAL) {
   3051 		bool add_tesscoord = false, add_tess_inout = false;
   3052 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
   3053 		for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
   3054 			/* if we have tesscoord save one reg */
   3055 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
   3056 				add_tesscoord = true;
   3057 			if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
   3058 			    ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
   3059 				add_tess_inout = true;
   3060 		}
   3061 		if (add_tesscoord || add_tess_inout)
   3062 			ctx.file_offset[TGSI_FILE_INPUT]++;
   3063 		if (add_tess_inout)
   3064 			ctx.file_offset[TGSI_FILE_INPUT]+=2;
   3065 	}
   3066 
   3067 	ctx.file_offset[TGSI_FILE_OUTPUT] =
   3068 			ctx.file_offset[TGSI_FILE_INPUT] +
   3069 			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
   3070 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
   3071 						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
   3072 
   3073 	/* Outside the GPR range. This will be translated to one of the
   3074 	 * kcache banks later. */
   3075 	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
   3076 
   3077 	ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
   3078 	ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
   3079 			ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
   3080 	ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
   3081 	ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
   3082 
   3083 	if (ctx.type == PIPE_SHADER_TESS_CTRL) {
   3084 		ctx.tess_input_info = ctx.bc->ar_reg + 3;
   3085 		ctx.tess_output_info = ctx.bc->ar_reg + 4;
   3086 		ctx.temp_reg = ctx.bc->ar_reg + 5;
   3087 	} else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
   3088 		ctx.tess_input_info = 0;
   3089 		ctx.tess_output_info = ctx.bc->ar_reg + 3;
   3090 		ctx.temp_reg = ctx.bc->ar_reg + 4;
   3091 	} else if (ctx.type == PIPE_SHADER_GEOMETRY) {
   3092 		ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
   3093 		ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
   3094 		ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
   3095 		ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
   3096 		ctx.temp_reg = ctx.bc->ar_reg + 7;
   3097 	} else {
   3098 		ctx.temp_reg = ctx.bc->ar_reg + 3;
   3099 	}
   3100 
   3101 	shader->max_arrays = 0;
   3102 	shader->num_arrays = 0;
   3103 	if (indirect_gprs) {
   3104 
   3105 		if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
   3106 			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
   3107 			                   ctx.file_offset[TGSI_FILE_OUTPUT] -
   3108 			                   ctx.file_offset[TGSI_FILE_INPUT],
   3109 			                   0x0F);
   3110 		}
   3111 		if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
   3112 			r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
   3113 			                   ctx.file_offset[TGSI_FILE_TEMPORARY] -
   3114 			                   ctx.file_offset[TGSI_FILE_OUTPUT],
   3115 			                   0x0F);
   3116 		}
   3117 	}
   3118 
   3119 	ctx.nliterals = 0;
   3120 	ctx.literals = NULL;
   3121 
   3122 	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
   3123 			       ctx.info.colors_written == 1;
   3124 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
   3125 	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
   3126 
   3127 	if (shader->vs_as_gs_a)
   3128 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
   3129 
   3130 	if (ctx.type == PIPE_SHADER_TESS_EVAL)
   3131 		r600_fetch_tess_io_info(&ctx);
   3132 
   3133 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
   3134 		tgsi_parse_token(&ctx.parse);
   3135 		switch (ctx.parse.FullToken.Token.Type) {
   3136 		case TGSI_TOKEN_TYPE_IMMEDIATE:
   3137 			immediate = &ctx.parse.FullToken.FullImmediate;
   3138 			ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
   3139 			if(ctx.literals == NULL) {
   3140 				r = -ENOMEM;
   3141 				goto out_err;
   3142 			}
   3143 			ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
   3144 			ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
   3145 			ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
   3146 			ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
   3147 			ctx.nliterals++;
   3148 			break;
   3149 		case TGSI_TOKEN_TYPE_DECLARATION:
   3150 			r = tgsi_declaration(&ctx);
   3151 			if (r)
   3152 				goto out_err;
   3153 			break;
   3154 		case TGSI_TOKEN_TYPE_INSTRUCTION:
   3155 		case TGSI_TOKEN_TYPE_PROPERTY:
   3156 			break;
   3157 		default:
   3158 			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
   3159 			r = -EINVAL;
   3160 			goto out_err;
   3161 		}
   3162 	}
   3163 
   3164 	shader->ring_item_sizes[0] = ctx.next_ring_offset;
   3165 	shader->ring_item_sizes[1] = 0;
   3166 	shader->ring_item_sizes[2] = 0;
   3167 	shader->ring_item_sizes[3] = 0;
   3168 
   3169 	/* Process two side if needed */
   3170 	if (shader->two_side && ctx.colors_used) {
   3171 		int i, count = ctx.shader->ninput;
   3172 		unsigned next_lds_loc = ctx.shader->nlds;
   3173 
   3174 		/* additional inputs will be allocated right after the existing inputs,
   3175 		 * we won't need them after the color selection, so we don't need to
   3176 		 * reserve these gprs for the rest of the shader code and to adjust
   3177 		 * output offsets etc. */
   3178 		int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
   3179 				ctx.info.file_max[TGSI_FILE_INPUT] + 1;
   3180 
   3181 		/* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
   3182 		if (ctx.face_gpr == -1) {
   3183 			i = ctx.shader->ninput++;
   3184 			ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
   3185 			ctx.shader->input[i].spi_sid = 0;
   3186 			ctx.shader->input[i].gpr = gpr++;
   3187 			ctx.face_gpr = ctx.shader->input[i].gpr;
   3188 		}
   3189 
   3190 		for (i = 0; i < count; i++) {
   3191 			if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
   3192 				int ni = ctx.shader->ninput++;
   3193 				memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
   3194 				ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
   3195 				ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
   3196 				ctx.shader->input[ni].gpr = gpr++;
   3197 				// TGSI to LLVM needs to know the lds position of inputs.
   3198 				// Non LLVM path computes it later (in process_twoside_color)
   3199 				ctx.shader->input[ni].lds_pos = next_lds_loc++;
   3200 				ctx.shader->input[i].back_color_input = ni;
   3201 				if (ctx.bc->chip_class >= EVERGREEN) {
   3202 					if ((r = evergreen_interp_input(&ctx, ni)))
   3203 						return r;
   3204 				}
   3205 			}
   3206 		}
   3207 	}
   3208 
   3209 	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
   3210 		shader->nr_ps_max_color_exports = 8;
   3211 
   3212 	if (ctx.fragcoord_input >= 0) {
   3213 		if (ctx.bc->chip_class == CAYMAN) {
   3214 			for (j = 0 ; j < 4; j++) {
   3215 				struct r600_bytecode_alu alu;
   3216 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3217 				alu.op = ALU_OP1_RECIP_IEEE;
   3218 				alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
   3219 				alu.src[0].chan = 3;
   3220 
   3221 				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
   3222 				alu.dst.chan = j;
   3223 				alu.dst.write = (j == 3);
   3224 				alu.last = 1;
   3225 				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
   3226 					return r;
   3227 			}
   3228 		} else {
   3229 			struct r600_bytecode_alu alu;
   3230 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3231 			alu.op = ALU_OP1_RECIP_IEEE;
   3232 			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
   3233 			alu.src[0].chan = 3;
   3234 
   3235 			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
   3236 			alu.dst.chan = 3;
   3237 			alu.dst.write = 1;
   3238 			alu.last = 1;
   3239 			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
   3240 				return r;
   3241 		}
   3242 	}
   3243 
   3244 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
   3245 		struct r600_bytecode_alu alu;
   3246 		int r;
   3247 
   3248 		/* GS thread with no output workaround - emit a cut at start of GS */
   3249 		if (ctx.bc->chip_class == R600)
   3250 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
   3251 
   3252 		for (j = 0; j < 4; j++) {
   3253 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3254 			alu.op = ALU_OP1_MOV;
   3255 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
   3256 			alu.src[0].value = 0;
   3257 			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
   3258 			alu.dst.write = 1;
   3259 			alu.last = 1;
   3260 			r = r600_bytecode_add_alu(ctx.bc, &alu);
   3261 			if (r)
   3262 				return r;
   3263 		}
   3264 	}
   3265 
   3266 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
   3267 		r600_fetch_tess_io_info(&ctx);
   3268 
   3269 	if (shader->two_side && ctx.colors_used) {
   3270 		if ((r = process_twoside_color_inputs(&ctx)))
   3271 			return r;
   3272 	}
   3273 
   3274 	tgsi_parse_init(&ctx.parse, tokens);
   3275 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
   3276 		tgsi_parse_token(&ctx.parse);
   3277 		switch (ctx.parse.FullToken.Token.Type) {
   3278 		case TGSI_TOKEN_TYPE_INSTRUCTION:
   3279 			r = tgsi_is_supported(&ctx);
   3280 			if (r)
   3281 				goto out_err;
   3282 			ctx.max_driver_temp_used = 0;
   3283 			/* reserve first tmp for everyone */
   3284 			r600_get_temp(&ctx);
   3285 
   3286 			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
   3287 			if ((r = tgsi_split_constant(&ctx)))
   3288 				goto out_err;
   3289 			if ((r = tgsi_split_literal_constant(&ctx)))
   3290 				goto out_err;
   3291 			if (ctx.type == PIPE_SHADER_GEOMETRY) {
   3292 				if ((r = tgsi_split_gs_inputs(&ctx)))
   3293 					goto out_err;
   3294 			} else if (lds_inputs) {
   3295 				if ((r = tgsi_split_lds_inputs(&ctx)))
   3296 					goto out_err;
   3297 			}
   3298 			if (ctx.bc->chip_class == CAYMAN)
   3299 				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
   3300 			else if (ctx.bc->chip_class >= EVERGREEN)
   3301 				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
   3302 			else
   3303 				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
   3304 			r = ctx.inst_info->process(&ctx);
   3305 			if (r)
   3306 				goto out_err;
   3307 
   3308 			if (ctx.type == PIPE_SHADER_TESS_CTRL) {
   3309 				r = r600_store_tcs_output(&ctx);
   3310 				if (r)
   3311 					goto out_err;
   3312 			}
   3313 			break;
   3314 		default:
   3315 			break;
   3316 		}
   3317 	}
   3318 
   3319 	/* Reset the temporary register counter. */
   3320 	ctx.max_driver_temp_used = 0;
   3321 
   3322 	noutput = shader->noutput;
   3323 
   3324 	if (!ring_outputs && ctx.clip_vertex_write) {
   3325 		unsigned clipdist_temp[2];
   3326 
   3327 		clipdist_temp[0] = r600_get_temp(&ctx);
   3328 		clipdist_temp[1] = r600_get_temp(&ctx);
   3329 
   3330 		/* need to convert a clipvertex write into clipdistance writes and not export
   3331 		   the clip vertex anymore */
   3332 
   3333 		memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
   3334 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
   3335 		shader->output[noutput].gpr = clipdist_temp[0];
   3336 		noutput++;
   3337 		shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
   3338 		shader->output[noutput].gpr = clipdist_temp[1];
   3339 		noutput++;
   3340 
   3341 		/* reset spi_sid for clipvertex output to avoid confusing spi */
   3342 		shader->output[ctx.cv_output].spi_sid = 0;
   3343 
   3344 		shader->clip_dist_write = 0xFF;
   3345 
   3346 		for (i = 0; i < 8; i++) {
   3347 			int oreg = i >> 2;
   3348 			int ochan = i & 3;
   3349 
   3350 			for (j = 0; j < 4; j++) {
   3351 				struct r600_bytecode_alu alu;
   3352 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3353 				alu.op = ALU_OP2_DOT4;
   3354 				alu.src[0].sel = shader->output[ctx.cv_output].gpr;
   3355 				alu.src[0].chan = j;
   3356 
   3357 				alu.src[1].sel = 512 + i;
   3358 				alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
   3359 				alu.src[1].chan = j;
   3360 
   3361 				alu.dst.sel = clipdist_temp[oreg];
   3362 				alu.dst.chan = j;
   3363 				alu.dst.write = (j == ochan);
   3364 				if (j == 3)
   3365 					alu.last = 1;
   3366 				r = r600_bytecode_add_alu(ctx.bc, &alu);
   3367 				if (r)
   3368 					return r;
   3369 			}
   3370 		}
   3371 	}
   3372 
   3373 	/* Add stream outputs. */
   3374 	if (so.num_outputs) {
   3375 		bool emit = false;
   3376 		if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
   3377 			emit = true;
   3378 		if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
   3379 			emit = true;
   3380 		if (emit)
   3381 			emit_streamout(&ctx, &so, -1, NULL);
   3382 	}
   3383 	pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
   3384 	convert_edgeflag_to_int(&ctx);
   3385 
   3386 	if (ctx.type == PIPE_SHADER_TESS_CTRL)
   3387 		r600_emit_tess_factor(&ctx);
   3388 
   3389 	if (lds_outputs) {
   3390 		if (ctx.type == PIPE_SHADER_VERTEX) {
   3391 			if (ctx.shader->noutput)
   3392 				emit_lds_vs_writes(&ctx);
   3393 		}
   3394 	} else if (ring_outputs) {
   3395 		if (shader->vs_as_es || shader->tes_as_es) {
   3396 			ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
   3397 			ctx.gs_export_gpr_tregs[1] = -1;
   3398 			ctx.gs_export_gpr_tregs[2] = -1;
   3399 			ctx.gs_export_gpr_tregs[3] = -1;
   3400 
   3401 			emit_gs_ring_writes(&ctx, &so, -1, FALSE);
   3402 		}
   3403 	} else {
   3404 		/* Export output */
   3405 		next_clip_base = shader->vs_out_misc_write ? 62 : 61;
   3406 
   3407 		for (i = 0, j = 0; i < noutput; i++, j++) {
   3408 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   3409 			output[j].gpr = shader->output[i].gpr;
   3410 			output[j].elem_size = 3;
   3411 			output[j].swizzle_x = 0;
   3412 			output[j].swizzle_y = 1;
   3413 			output[j].swizzle_z = 2;
   3414 			output[j].swizzle_w = 3;
   3415 			output[j].burst_count = 1;
   3416 			output[j].type = -1;
   3417 			output[j].op = CF_OP_EXPORT;
   3418 			switch (ctx.type) {
   3419 			case PIPE_SHADER_VERTEX:
   3420 			case PIPE_SHADER_TESS_EVAL:
   3421 				switch (shader->output[i].name) {
   3422 				case TGSI_SEMANTIC_POSITION:
   3423 					output[j].array_base = 60;
   3424 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   3425 					pos_emitted = true;
   3426 					break;
   3427 
   3428 				case TGSI_SEMANTIC_PSIZE:
   3429 					output[j].array_base = 61;
   3430 					output[j].swizzle_y = 7;
   3431 					output[j].swizzle_z = 7;
   3432 					output[j].swizzle_w = 7;
   3433 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   3434 					pos_emitted = true;
   3435 					break;
   3436 				case TGSI_SEMANTIC_EDGEFLAG:
   3437 					output[j].array_base = 61;
   3438 					output[j].swizzle_x = 7;
   3439 					output[j].swizzle_y = 0;
   3440 					output[j].swizzle_z = 7;
   3441 					output[j].swizzle_w = 7;
   3442 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   3443 					pos_emitted = true;
   3444 					break;
   3445 				case TGSI_SEMANTIC_LAYER:
   3446 					/* spi_sid is 0 for outputs that are
   3447 					 * not consumed by PS */
   3448 					if (shader->output[i].spi_sid) {
   3449 						output[j].array_base = next_param_base++;
   3450 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   3451 						j++;
   3452 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
   3453 					}
   3454 					output[j].array_base = 61;
   3455 					output[j].swizzle_x = 7;
   3456 					output[j].swizzle_y = 7;
   3457 					output[j].swizzle_z = 0;
   3458 					output[j].swizzle_w = 7;
   3459 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   3460 					pos_emitted = true;
   3461 					break;
   3462 				case TGSI_SEMANTIC_VIEWPORT_INDEX:
   3463 					/* spi_sid is 0 for outputs that are
   3464 					 * not consumed by PS */
   3465 					if (shader->output[i].spi_sid) {
   3466 						output[j].array_base = next_param_base++;
   3467 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   3468 						j++;
   3469 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
   3470 					}
   3471 					output[j].array_base = 61;
   3472 					output[j].swizzle_x = 7;
   3473 					output[j].swizzle_y = 7;
   3474 					output[j].swizzle_z = 7;
   3475 					output[j].swizzle_w = 0;
   3476 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   3477 					pos_emitted = true;
   3478 					break;
   3479 				case TGSI_SEMANTIC_CLIPVERTEX:
   3480 					j--;
   3481 					break;
   3482 				case TGSI_SEMANTIC_CLIPDIST:
   3483 					output[j].array_base = next_clip_base++;
   3484 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   3485 					pos_emitted = true;
   3486 					/* spi_sid is 0 for clipdistance outputs that were generated
   3487 					 * for clipvertex - we don't need to pass them to PS */
   3488 					if (shader->output[i].spi_sid) {
   3489 						j++;
   3490 						/* duplicate it as PARAM to pass to the pixel shader */
   3491 						memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
   3492 						output[j].array_base = next_param_base++;
   3493 						output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   3494 					}
   3495 					break;
   3496 				case TGSI_SEMANTIC_FOG:
   3497 					output[j].swizzle_y = 4; /* 0 */
   3498 					output[j].swizzle_z = 4; /* 0 */
   3499 					output[j].swizzle_w = 5; /* 1 */
   3500 					break;
   3501 				case TGSI_SEMANTIC_PRIMID:
   3502 					output[j].swizzle_x = 2;
   3503 					output[j].swizzle_y = 4; /* 0 */
   3504 					output[j].swizzle_z = 4; /* 0 */
   3505 					output[j].swizzle_w = 4; /* 0 */
   3506 					break;
   3507 				}
   3508 
   3509 				break;
   3510 			case PIPE_SHADER_FRAGMENT:
   3511 				if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
   3512 					/* never export more colors than the number of CBs */
   3513 					if (shader->output[i].sid >= max_color_exports) {
   3514 						/* skip export */
   3515 						j--;
   3516 						continue;
   3517 					}
   3518 					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
   3519 					output[j].array_base = shader->output[i].sid;
   3520 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   3521 					shader->nr_ps_color_exports++;
   3522 					if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
   3523 						for (k = 1; k < max_color_exports; k++) {
   3524 							j++;
   3525 							memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   3526 							output[j].gpr = shader->output[i].gpr;
   3527 							output[j].elem_size = 3;
   3528 							output[j].swizzle_x = 0;
   3529 							output[j].swizzle_y = 1;
   3530 							output[j].swizzle_z = 2;
   3531 							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
   3532 							output[j].burst_count = 1;
   3533 							output[j].array_base = k;
   3534 							output[j].op = CF_OP_EXPORT;
   3535 							output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   3536 							shader->nr_ps_color_exports++;
   3537 						}
   3538 					}
   3539 				} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
   3540 					output[j].array_base = 61;
   3541 					output[j].swizzle_x = 2;
   3542 					output[j].swizzle_y = 7;
   3543 					output[j].swizzle_z = output[j].swizzle_w = 7;
   3544 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   3545 				} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
   3546 					output[j].array_base = 61;
   3547 					output[j].swizzle_x = 7;
   3548 					output[j].swizzle_y = 1;
   3549 					output[j].swizzle_z = output[j].swizzle_w = 7;
   3550 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   3551 				} else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
   3552 					output[j].array_base = 61;
   3553 					output[j].swizzle_x = 7;
   3554 					output[j].swizzle_y = 7;
   3555 					output[j].swizzle_z = 0;
   3556 					output[j].swizzle_w = 7;
   3557 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   3558 				} else {
   3559 					R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
   3560 					r = -EINVAL;
   3561 					goto out_err;
   3562 				}
   3563 				break;
   3564 			case PIPE_SHADER_TESS_CTRL:
   3565 				break;
   3566 			default:
   3567 				R600_ERR("unsupported processor type %d\n", ctx.type);
   3568 				r = -EINVAL;
   3569 				goto out_err;
   3570 			}
   3571 
   3572 			if (output[j].type==-1) {
   3573 				output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   3574 				output[j].array_base = next_param_base++;
   3575 			}
   3576 		}
   3577 
   3578 		/* add fake position export */
   3579 		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
   3580 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   3581 			output[j].gpr = 0;
   3582 			output[j].elem_size = 3;
   3583 			output[j].swizzle_x = 7;
   3584 			output[j].swizzle_y = 7;
   3585 			output[j].swizzle_z = 7;
   3586 			output[j].swizzle_w = 7;
   3587 			output[j].burst_count = 1;
   3588 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
   3589 			output[j].array_base = 60;
   3590 			output[j].op = CF_OP_EXPORT;
   3591 			j++;
   3592 		}
   3593 
   3594 		/* add fake param output for vertex shader if no param is exported */
   3595 		if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
   3596 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   3597 			output[j].gpr = 0;
   3598 			output[j].elem_size = 3;
   3599 			output[j].swizzle_x = 7;
   3600 			output[j].swizzle_y = 7;
   3601 			output[j].swizzle_z = 7;
   3602 			output[j].swizzle_w = 7;
   3603 			output[j].burst_count = 1;
   3604 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
   3605 			output[j].array_base = 0;
   3606 			output[j].op = CF_OP_EXPORT;
   3607 			j++;
   3608 		}
   3609 
   3610 		/* add fake pixel export */
   3611 		if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
   3612 			memset(&output[j], 0, sizeof(struct r600_bytecode_output));
   3613 			output[j].gpr = 0;
   3614 			output[j].elem_size = 3;
   3615 			output[j].swizzle_x = 7;
   3616 			output[j].swizzle_y = 7;
   3617 			output[j].swizzle_z = 7;
   3618 			output[j].swizzle_w = 7;
   3619 			output[j].burst_count = 1;
   3620 			output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
   3621 			output[j].array_base = 0;
   3622 			output[j].op = CF_OP_EXPORT;
   3623 			j++;
   3624 			shader->nr_ps_color_exports++;
   3625 		}
   3626 
   3627 		noutput = j;
   3628 
   3629 		/* set export done on last export of each type */
   3630 		for (i = noutput - 1, output_done = 0; i >= 0; i--) {
   3631 			if (!(output_done & (1 << output[i].type))) {
   3632 				output_done |= (1 << output[i].type);
   3633 				output[i].op = CF_OP_EXPORT_DONE;
   3634 			}
   3635 		}
   3636 		/* add output to bytecode */
   3637 		for (i = 0; i < noutput; i++) {
   3638 			r = r600_bytecode_add_output(ctx.bc, &output[i]);
   3639 			if (r)
   3640 				goto out_err;
   3641 		}
   3642 	}
   3643 
   3644 	/* add program end */
   3645 	if (ctx.bc->chip_class == CAYMAN)
   3646 		cm_bytecode_add_cf_end(ctx.bc);
   3647 	else {
   3648 		const struct cf_op_info *last = NULL;
   3649 
   3650 		if (ctx.bc->cf_last)
   3651 			last = r600_isa_cf(ctx.bc->cf_last->op);
   3652 
   3653 		/* alu clause instructions don't have EOP bit, so add NOP */
   3654 		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
   3655 			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
   3656 
   3657 		ctx.bc->cf_last->end_of_program = 1;
   3658 	}
   3659 
   3660 	/* check GPR limit - we have 124 = 128 - 4
   3661 	 * (4 are reserved as alu clause temporary registers) */
   3662 	if (ctx.bc->ngpr > 124) {
   3663 		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
   3664 		r = -ENOMEM;
   3665 		goto out_err;
   3666 	}
   3667 
   3668 	if (ctx.type == PIPE_SHADER_GEOMETRY) {
   3669 		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
   3670 			return r;
   3671 	}
   3672 
   3673 	free(ctx.literals);
   3674 	tgsi_parse_free(&ctx.parse);
   3675 	return 0;
   3676 out_err:
   3677 	free(ctx.literals);
   3678 	tgsi_parse_free(&ctx.parse);
   3679 	return r;
   3680 }
   3681 
   3682 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
   3683 {
   3684 	const unsigned tgsi_opcode =
   3685 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
   3686 	R600_ERR("%s tgsi opcode unsupported\n",
   3687 		 tgsi_get_opcode_name(tgsi_opcode));
   3688 	return -EINVAL;
   3689 }
   3690 
   3691 static int tgsi_end(struct r600_shader_ctx *ctx)
   3692 {
   3693 	return 0;
   3694 }
   3695 
   3696 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
   3697 			const struct r600_shader_src *shader_src,
   3698 			unsigned chan)
   3699 {
   3700 	bc_src->sel = shader_src->sel;
   3701 	bc_src->chan = shader_src->swizzle[chan];
   3702 	bc_src->neg = shader_src->neg;
   3703 	bc_src->abs = shader_src->abs;
   3704 	bc_src->rel = shader_src->rel;
   3705 	bc_src->value = shader_src->value[bc_src->chan];
   3706 	bc_src->kc_bank = shader_src->kc_bank;
   3707 	bc_src->kc_rel = shader_src->kc_rel;
   3708 }
   3709 
   3710 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
   3711 {
   3712 	bc_src->abs = 1;
   3713 	bc_src->neg = 0;
   3714 }
   3715 
   3716 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
   3717 {
   3718 	bc_src->neg = !bc_src->neg;
   3719 }
   3720 
   3721 static void tgsi_dst(struct r600_shader_ctx *ctx,
   3722 		     const struct tgsi_full_dst_register *tgsi_dst,
   3723 		     unsigned swizzle,
   3724 		     struct r600_bytecode_alu_dst *r600_dst)
   3725 {
   3726 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3727 
   3728 	r600_dst->sel = tgsi_dst->Register.Index;
   3729 	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
   3730 	r600_dst->chan = swizzle;
   3731 	r600_dst->write = 1;
   3732 	if (inst->Instruction.Saturate) {
   3733 		r600_dst->clamp = 1;
   3734 	}
   3735 	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
   3736 		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
   3737 			return;
   3738 		}
   3739 	}
   3740 	if (tgsi_dst->Register.Indirect)
   3741 		r600_dst->rel = V_SQ_REL_RELATIVE;
   3742 
   3743 }
   3744 
   3745 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
   3746 {
   3747 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3748 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   3749 	struct r600_bytecode_alu alu;
   3750 	int i, j, r, lasti = tgsi_last_instruction(write_mask);
   3751 	int use_tmp = 0;
   3752 
   3753 	if (singledest) {
   3754 		switch (write_mask) {
   3755 		case 0x1:
   3756 			write_mask = 0x3;
   3757 			break;
   3758 		case 0x2:
   3759 			use_tmp = 1;
   3760 			write_mask = 0x3;
   3761 			break;
   3762 		case 0x4:
   3763 			write_mask = 0xc;
   3764 			break;
   3765 		case 0x8:
   3766 			write_mask = 0xc;
   3767 			use_tmp = 3;
   3768 			break;
   3769 		}
   3770 	}
   3771 
   3772 	lasti = tgsi_last_instruction(write_mask);
   3773 	for (i = 0; i <= lasti; i++) {
   3774 
   3775 		if (!(write_mask & (1 << i)))
   3776 			continue;
   3777 
   3778 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3779 
   3780 		if (singledest) {
   3781 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3782 			if (use_tmp) {
   3783 				alu.dst.sel = ctx->temp_reg;
   3784 				alu.dst.chan = i;
   3785 				alu.dst.write = 1;
   3786 			}
   3787 			if (i == 1 || i == 3)
   3788 				alu.dst.write = 0;
   3789 		} else
   3790 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3791 
   3792 		alu.op = ctx->inst_info->op;
   3793 		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
   3794 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   3795 		} else if (!swap) {
   3796 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   3797 				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
   3798 			}
   3799 		} else {
   3800 			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
   3801 			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
   3802 		}
   3803 
   3804 		/* handle some special cases */
   3805 		if (i == 1 || i == 3) {
   3806 			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
   3807 			case TGSI_OPCODE_DABS:
   3808 				r600_bytecode_src_set_abs(&alu.src[0]);
   3809 				break;
   3810 			default:
   3811 				break;
   3812 			}
   3813 		}
   3814 		if (i == lasti) {
   3815 			alu.last = 1;
   3816 		}
   3817 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3818 		if (r)
   3819 			return r;
   3820 	}
   3821 
   3822 	if (use_tmp) {
   3823 		write_mask = inst->Dst[0].Register.WriteMask;
   3824 
   3825 		/* move result from temp to dst */
   3826 		for (i = 0; i <= lasti; i++) {
   3827 			if (!(write_mask & (1 << i)))
   3828 				continue;
   3829 
   3830 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3831 			alu.op = ALU_OP1_MOV;
   3832 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3833 			alu.src[0].sel = ctx->temp_reg;
   3834 			alu.src[0].chan = use_tmp - 1;
   3835 			alu.last = (i == lasti);
   3836 
   3837 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   3838 			if (r)
   3839 				return r;
   3840 		}
   3841 	}
   3842 	return 0;
   3843 }
   3844 
   3845 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
   3846 {
   3847 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3848 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   3849 	/* confirm writemasking */
   3850 	if ((write_mask & 0x3) != 0x3 &&
   3851 	    (write_mask & 0xc) != 0xc) {
   3852 		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
   3853 		return -1;
   3854 	}
   3855 	return tgsi_op2_64_params(ctx, false, false);
   3856 }
   3857 
   3858 static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
   3859 {
   3860 	return tgsi_op2_64_params(ctx, true, false);
   3861 }
   3862 
   3863 static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
   3864 {
   3865 	return tgsi_op2_64_params(ctx, true, true);
   3866 }
   3867 
   3868 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
   3869 {
   3870 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3871 	struct r600_bytecode_alu alu;
   3872 	int i, j, r;
   3873 	int lasti = 3;
   3874 	int tmp = r600_get_temp(ctx);
   3875 
   3876 	for (i = 0; i < lasti + 1; i++) {
   3877 
   3878 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3879 		alu.op = ctx->inst_info->op;
   3880 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   3881 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
   3882 		}
   3883 
   3884 		if (inst->Dst[0].Register.WriteMask & (1 << i))
   3885 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3886 		else
   3887 			alu.dst.sel = tmp;
   3888 
   3889 		alu.dst.chan = i;
   3890 		alu.is_op3 = 1;
   3891 		if (i == lasti) {
   3892 			alu.last = 1;
   3893 		}
   3894 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3895 		if (r)
   3896 			return r;
   3897 	}
   3898 	return 0;
   3899 }
   3900 
   3901 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
   3902 {
   3903 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3904 	struct r600_bytecode_alu alu;
   3905 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   3906 	int i, j, r, lasti = tgsi_last_instruction(write_mask);
   3907 	/* use temp register if trans_only and more than one dst component */
   3908 	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
   3909 
   3910 	for (i = 0; i <= lasti; i++) {
   3911 		if (!(write_mask & (1 << i)))
   3912 			continue;
   3913 
   3914 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3915 		if (use_tmp) {
   3916 			alu.dst.sel = ctx->temp_reg;
   3917 			alu.dst.chan = i;
   3918 			alu.dst.write = 1;
   3919 		} else
   3920 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3921 
   3922 		alu.op = ctx->inst_info->op;
   3923 		if (!swap) {
   3924 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   3925 				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
   3926 			}
   3927 		} else {
   3928 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   3929 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3930 		}
   3931 		if (i == lasti || trans_only) {
   3932 			alu.last = 1;
   3933 		}
   3934 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3935 		if (r)
   3936 			return r;
   3937 	}
   3938 
   3939 	if (use_tmp) {
   3940 		/* move result from temp to dst */
   3941 		for (i = 0; i <= lasti; i++) {
   3942 			if (!(write_mask & (1 << i)))
   3943 				continue;
   3944 
   3945 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3946 			alu.op = ALU_OP1_MOV;
   3947 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3948 			alu.src[0].sel = ctx->temp_reg;
   3949 			alu.src[0].chan = i;
   3950 			alu.last = (i == lasti);
   3951 
   3952 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   3953 			if (r)
   3954 				return r;
   3955 		}
   3956 	}
   3957 	return 0;
   3958 }
   3959 
   3960 static int tgsi_op2(struct r600_shader_ctx *ctx)
   3961 {
   3962 	return tgsi_op2_s(ctx, 0, 0);
   3963 }
   3964 
   3965 static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
   3966 {
   3967 	return tgsi_op2_s(ctx, 1, 0);
   3968 }
   3969 
   3970 static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
   3971 {
   3972 	return tgsi_op2_s(ctx, 0, 1);
   3973 }
   3974 
   3975 static int tgsi_ineg(struct r600_shader_ctx *ctx)
   3976 {
   3977 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   3978 	struct r600_bytecode_alu alu;
   3979 	int i, r;
   3980 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   3981 
   3982 	for (i = 0; i < lasti + 1; i++) {
   3983 
   3984 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   3985 			continue;
   3986 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   3987 		alu.op = ctx->inst_info->op;
   3988 
   3989 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   3990 
   3991 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   3992 
   3993 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   3994 
   3995 		if (i == lasti) {
   3996 			alu.last = 1;
   3997 		}
   3998 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   3999 		if (r)
   4000 			return r;
   4001 	}
   4002 	return 0;
   4003 
   4004 }
   4005 
   4006 static int tgsi_dneg(struct r600_shader_ctx *ctx)
   4007 {
   4008 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4009 	struct r600_bytecode_alu alu;
   4010 	int i, r;
   4011 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4012 
   4013 	for (i = 0; i < lasti + 1; i++) {
   4014 
   4015 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4016 			continue;
   4017 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4018 		alu.op = ALU_OP1_MOV;
   4019 
   4020 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   4021 
   4022 		if (i == 1 || i == 3)
   4023 			r600_bytecode_src_toggle_neg(&alu.src[0]);
   4024 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4025 
   4026 		if (i == lasti) {
   4027 			alu.last = 1;
   4028 		}
   4029 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4030 		if (r)
   4031 			return r;
   4032 	}
   4033 	return 0;
   4034 
   4035 }
   4036 
   4037 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
   4038 {
   4039 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4040 	struct r600_bytecode_alu alu;
   4041 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   4042 	int i, j, r;
   4043 	int firsti = write_mask == 0xc ? 2 : 0;
   4044 
   4045 	for (i = 0; i <= 3; i++) {
   4046 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4047 		alu.op = ctx->inst_info->op;
   4048 
   4049 		alu.dst.sel = ctx->temp_reg;
   4050 		alu.dst.chan = i;
   4051 		alu.dst.write = 1;
   4052 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   4053 			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
   4054 		}
   4055 
   4056 		if (i == 3)
   4057 			alu.last = 1;
   4058 
   4059 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4060 		if (r)
   4061 			return r;
   4062 	}
   4063 
   4064 	/* MOV first two channels to writemask dst0 */
   4065 	for (i = 0; i <= 1; i++) {
   4066 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4067 		alu.op = ALU_OP1_MOV;
   4068 		alu.src[0].chan = i + 2;
   4069 		alu.src[0].sel = ctx->temp_reg;
   4070 
   4071 		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
   4072 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
   4073 		alu.last = 1;
   4074 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4075 		if (r)
   4076 			return r;
   4077 	}
   4078 
   4079 	for (i = 0; i <= 3; i++) {
   4080 		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
   4081 			/* MOV third channels to writemask dst1 */
   4082 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4083 			alu.op = ALU_OP1_MOV;
   4084 			alu.src[0].chan = 1;
   4085 			alu.src[0].sel = ctx->temp_reg;
   4086 
   4087 			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
   4088 			alu.last = 1;
   4089 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4090 			if (r)
   4091 				return r;
   4092 			break;
   4093 		}
   4094 	}
   4095 	return 0;
   4096 }
   4097 
   4098 
   4099 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
   4100 {
   4101 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4102 	struct r600_bytecode_alu alu;
   4103 	int i, r;
   4104 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4105 
   4106 	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
   4107 		inst->Instruction.Opcode == TGSI_OPCODE_U2D);
   4108 
   4109 	for (i = 0; i <= (lasti+1)/2; i++) {
   4110 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4111 		alu.op = ctx->inst_info->op;
   4112 
   4113 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   4114 		alu.dst.sel = ctx->temp_reg;
   4115 		alu.dst.chan = i;
   4116 		alu.dst.write = 1;
   4117 		alu.last = 1;
   4118 
   4119 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4120 		if (r)
   4121 			return r;
   4122 	}
   4123 
   4124 	for (i = 0; i <= lasti; i++) {
   4125 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4126 		alu.op = ALU_OP1_FLT32_TO_FLT64;
   4127 
   4128 		alu.src[0].chan = i/2;
   4129 		if (i%2 == 0)
   4130 			alu.src[0].sel = ctx->temp_reg;
   4131 		else {
   4132 			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
   4133 			alu.src[0].value = 0x0;
   4134 		}
   4135 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4136 		alu.last = i == lasti;
   4137 
   4138 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4139 		if (r)
   4140 			return r;
   4141 	}
   4142 
   4143 	return 0;
   4144 }
   4145 
   4146 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
   4147 {
   4148 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4149 	struct r600_bytecode_alu alu;
   4150 	int i, r;
   4151 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4152 
   4153 	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
   4154 		inst->Instruction.Opcode == TGSI_OPCODE_D2U);
   4155 
   4156 	for (i = 0; i <= lasti; i++) {
   4157 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4158 		alu.op = ALU_OP1_FLT64_TO_FLT32;
   4159 
   4160 		r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
   4161 		alu.dst.chan = i;
   4162 		alu.dst.sel = ctx->temp_reg;
   4163 		alu.dst.write = i%2 == 0;
   4164 		alu.last = i == lasti;
   4165 
   4166 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4167 		if (r)
   4168 			return r;
   4169 	}
   4170 
   4171 	for (i = 0; i <= (lasti+1)/2; i++) {
   4172 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4173 		alu.op = ctx->inst_info->op;
   4174 
   4175 		alu.src[0].chan = i*2;
   4176 		alu.src[0].sel = ctx->temp_reg;
   4177 		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
   4178 		alu.last = 1;
   4179 
   4180 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4181 		if (r)
   4182 			return r;
   4183 	}
   4184 
   4185 	return 0;
   4186 }
   4187 
   4188 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
   4189 					unsigned op,
   4190 					int dst_reg,
   4191 					struct r600_shader_src *src,
   4192 					bool abs)
   4193 {
   4194 	struct r600_bytecode_alu alu;
   4195 	const int last_slot = 3;
   4196 	int r;
   4197 
   4198 	/* these have to write the result to X/Y by the looks of it */
   4199 	for (int i = 0 ; i < last_slot; i++) {
   4200 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4201 		alu.op = op;
   4202 
   4203 		r600_bytecode_src(&alu.src[0], src, 1);
   4204 		r600_bytecode_src(&alu.src[1], src, 0);
   4205 
   4206 		if (abs)
   4207 			r600_bytecode_src_set_abs(&alu.src[1]);
   4208 
   4209 		alu.dst.sel = dst_reg;
   4210 		alu.dst.chan = i;
   4211 		alu.dst.write = (i == 0 || i == 1);
   4212 
   4213 		if (bc->chip_class != CAYMAN || i == last_slot - 1)
   4214 			alu.last = 1;
   4215 		r = r600_bytecode_add_alu(bc, &alu);
   4216 		if (r)
   4217 			return r;
   4218 	}
   4219 
   4220 	return 0;
   4221 }
   4222 
   4223 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
   4224 {
   4225 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4226 	int i, r;
   4227 	struct r600_bytecode_alu alu;
   4228 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4229 	int t1 = ctx->temp_reg;
   4230 
   4231 	/* should only be one src regs */
   4232 	assert(inst->Instruction.NumSrcRegs == 1);
   4233 
   4234 	/* only support one double at a time */
   4235 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
   4236 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
   4237 
   4238 	r = cayman_emit_unary_double_raw(
   4239 		ctx->bc, ctx->inst_info->op, t1,
   4240 		&ctx->src[0],
   4241 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
   4242 		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
   4243 	if (r)
   4244 		return r;
   4245 
   4246 	for (i = 0 ; i <= lasti; i++) {
   4247 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4248 			continue;
   4249 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4250 		alu.op = ALU_OP1_MOV;
   4251 		alu.src[0].sel = t1;
   4252 		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
   4253 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4254 		alu.dst.write = 1;
   4255 		if (i == lasti)
   4256 			alu.last = 1;
   4257 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4258 		if (r)
   4259 			return r;
   4260 	}
   4261 	return 0;
   4262 }
   4263 
   4264 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
   4265 {
   4266 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4267 	int i, j, r;
   4268 	struct r600_bytecode_alu alu;
   4269 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
   4270 
   4271 	for (i = 0 ; i < last_slot; i++) {
   4272 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4273 		alu.op = ctx->inst_info->op;
   4274 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   4275 			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
   4276 
   4277 			/* RSQ should take the absolute value of src */
   4278 			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
   4279 				r600_bytecode_src_set_abs(&alu.src[j]);
   4280 			}
   4281 		}
   4282 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4283 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   4284 
   4285 		if (i == last_slot - 1)
   4286 			alu.last = 1;
   4287 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4288 		if (r)
   4289 			return r;
   4290 	}
   4291 	return 0;
   4292 }
   4293 
   4294 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
   4295 {
   4296 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4297 	int i, j, k, r;
   4298 	struct r600_bytecode_alu alu;
   4299 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4300 	int t1 = ctx->temp_reg;
   4301 
   4302 	for (k = 0; k <= lasti; k++) {
   4303 		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
   4304 			continue;
   4305 
   4306 		for (i = 0 ; i < 4; i++) {
   4307 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4308 			alu.op = ctx->inst_info->op;
   4309 			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   4310 				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
   4311 			}
   4312 			alu.dst.sel = t1;
   4313 			alu.dst.chan = i;
   4314 			alu.dst.write = (i == k);
   4315 			if (i == 3)
   4316 				alu.last = 1;
   4317 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4318 			if (r)
   4319 				return r;
   4320 		}
   4321 	}
   4322 
   4323 	for (i = 0 ; i <= lasti; i++) {
   4324 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4325 			continue;
   4326 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4327 		alu.op = ALU_OP1_MOV;
   4328 		alu.src[0].sel = t1;
   4329 		alu.src[0].chan = i;
   4330 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4331 		alu.dst.write = 1;
   4332 		if (i == lasti)
   4333 			alu.last = 1;
   4334 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4335 		if (r)
   4336 			return r;
   4337 	}
   4338 
   4339 	return 0;
   4340 }
   4341 
   4342 
   4343 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
   4344 {
   4345 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4346 	int i, j, k, r;
   4347 	struct r600_bytecode_alu alu;
   4348 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4349 	int t1 = ctx->temp_reg;
   4350 
   4351 	/* t1 would get overwritten below if we actually tried to
   4352 	 * multiply two pairs of doubles at a time. */
   4353 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
   4354 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
   4355 
   4356 	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
   4357 
   4358 	for (i = 0; i < 4; i++) {
   4359 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4360 		alu.op = ctx->inst_info->op;
   4361 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   4362 			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
   4363 		}
   4364 		alu.dst.sel = t1;
   4365 		alu.dst.chan = i;
   4366 		alu.dst.write = 1;
   4367 		if (i == 3)
   4368 			alu.last = 1;
   4369 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4370 		if (r)
   4371 			return r;
   4372 	}
   4373 
   4374 	for (i = 0; i <= lasti; i++) {
   4375 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4376 			continue;
   4377 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4378 		alu.op = ALU_OP1_MOV;
   4379 		alu.src[0].sel = t1;
   4380 		alu.src[0].chan = i;
   4381 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4382 		alu.dst.write = 1;
   4383 		if (i == lasti)
   4384 			alu.last = 1;
   4385 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4386 		if (r)
   4387 			return r;
   4388 	}
   4389 
   4390 	return 0;
   4391 }
   4392 
   4393 /*
   4394  * Emit RECIP_64 + MUL_64 to implement division.
   4395  */
   4396 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
   4397 {
   4398 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4399 	int r;
   4400 	struct r600_bytecode_alu alu;
   4401 	int t1 = ctx->temp_reg;
   4402 	int k;
   4403 
   4404 	/* Only support one double at a time. This is the same constraint as
   4405 	 * in DMUL lowering. */
   4406 	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
   4407 	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
   4408 
   4409 	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
   4410 
   4411 	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
   4412 	if (r)
   4413 		return r;
   4414 
   4415 	for (int i = 0; i < 4; i++) {
   4416 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4417 		alu.op = ALU_OP2_MUL_64;
   4418 
   4419 		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
   4420 
   4421 		alu.src[1].sel = t1;
   4422 		alu.src[1].chan = (i == 3) ? 0 : 1;
   4423 
   4424 		alu.dst.sel = t1;
   4425 		alu.dst.chan = i;
   4426 		alu.dst.write = 1;
   4427 		if (i == 3)
   4428 			alu.last = 1;
   4429 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4430 		if (r)
   4431 			return r;
   4432 	}
   4433 
   4434 	for (int i = 0; i < 2; i++) {
   4435 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4436 		alu.op = ALU_OP1_MOV;
   4437 		alu.src[0].sel = t1;
   4438 		alu.src[0].chan = i;
   4439 		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
   4440 		alu.dst.write = 1;
   4441 		if (i == 1)
   4442 			alu.last = 1;
   4443 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4444 		if (r)
   4445 			return r;
   4446 	}
   4447 	return 0;
   4448 }
   4449 
   4450 /*
   4451  * r600 - trunc to -PI..PI range
   4452  * r700 - normalize by dividing by 2PI
   4453  * see fdo bug 27901
   4454  */
   4455 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
   4456 {
   4457 	int r;
   4458 	struct r600_bytecode_alu alu;
   4459 
   4460 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4461 	alu.op = ALU_OP3_MULADD;
   4462 	alu.is_op3 = 1;
   4463 
   4464 	alu.dst.chan = 0;
   4465 	alu.dst.sel = ctx->temp_reg;
   4466 	alu.dst.write = 1;
   4467 
   4468 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4469 
   4470 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   4471 	alu.src[1].chan = 0;
   4472 	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
   4473 	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
   4474 	alu.src[2].chan = 0;
   4475 	alu.last = 1;
   4476 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4477 	if (r)
   4478 		return r;
   4479 
   4480 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4481 	alu.op = ALU_OP1_FRACT;
   4482 
   4483 	alu.dst.chan = 0;
   4484 	alu.dst.sel = ctx->temp_reg;
   4485 	alu.dst.write = 1;
   4486 
   4487 	alu.src[0].sel = ctx->temp_reg;
   4488 	alu.src[0].chan = 0;
   4489 	alu.last = 1;
   4490 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4491 	if (r)
   4492 		return r;
   4493 
   4494 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4495 	alu.op = ALU_OP3_MULADD;
   4496 	alu.is_op3 = 1;
   4497 
   4498 	alu.dst.chan = 0;
   4499 	alu.dst.sel = ctx->temp_reg;
   4500 	alu.dst.write = 1;
   4501 
   4502 	alu.src[0].sel = ctx->temp_reg;
   4503 	alu.src[0].chan = 0;
   4504 
   4505 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   4506 	alu.src[1].chan = 0;
   4507 	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
   4508 	alu.src[2].chan = 0;
   4509 
   4510 	if (ctx->bc->chip_class == R600) {
   4511 		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
   4512 		alu.src[2].value = u_bitcast_f2u(-M_PI);
   4513 	} else {
   4514 		alu.src[1].sel = V_SQ_ALU_SRC_1;
   4515 		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
   4516 		alu.src[2].neg = 1;
   4517 	}
   4518 
   4519 	alu.last = 1;
   4520 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4521 	if (r)
   4522 		return r;
   4523 	return 0;
   4524 }
   4525 
   4526 static int cayman_trig(struct r600_shader_ctx *ctx)
   4527 {
   4528 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4529 	struct r600_bytecode_alu alu;
   4530 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
   4531 	int i, r;
   4532 
   4533 	r = tgsi_setup_trig(ctx);
   4534 	if (r)
   4535 		return r;
   4536 
   4537 
   4538 	for (i = 0; i < last_slot; i++) {
   4539 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4540 		alu.op = ctx->inst_info->op;
   4541 		alu.dst.chan = i;
   4542 
   4543 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4544 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   4545 
   4546 		alu.src[0].sel = ctx->temp_reg;
   4547 		alu.src[0].chan = 0;
   4548 		if (i == last_slot - 1)
   4549 			alu.last = 1;
   4550 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4551 		if (r)
   4552 			return r;
   4553 	}
   4554 	return 0;
   4555 }
   4556 
   4557 static int tgsi_trig(struct r600_shader_ctx *ctx)
   4558 {
   4559 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4560 	struct r600_bytecode_alu alu;
   4561 	int i, r;
   4562 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   4563 
   4564 	r = tgsi_setup_trig(ctx);
   4565 	if (r)
   4566 		return r;
   4567 
   4568 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4569 	alu.op = ctx->inst_info->op;
   4570 	alu.dst.chan = 0;
   4571 	alu.dst.sel = ctx->temp_reg;
   4572 	alu.dst.write = 1;
   4573 
   4574 	alu.src[0].sel = ctx->temp_reg;
   4575 	alu.src[0].chan = 0;
   4576 	alu.last = 1;
   4577 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4578 	if (r)
   4579 		return r;
   4580 
   4581 	/* replicate result */
   4582 	for (i = 0; i < lasti + 1; i++) {
   4583 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   4584 			continue;
   4585 
   4586 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4587 		alu.op = ALU_OP1_MOV;
   4588 
   4589 		alu.src[0].sel = ctx->temp_reg;
   4590 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4591 		if (i == lasti)
   4592 			alu.last = 1;
   4593 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4594 		if (r)
   4595 			return r;
   4596 	}
   4597 	return 0;
   4598 }
   4599 
   4600 static int tgsi_scs(struct r600_shader_ctx *ctx)
   4601 {
   4602 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4603 	struct r600_bytecode_alu alu;
   4604 	int i, r;
   4605 
   4606 	/* We'll only need the trig stuff if we are going to write to the
   4607 	 * X or Y components of the destination vector.
   4608 	 */
   4609 	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
   4610 		r = tgsi_setup_trig(ctx);
   4611 		if (r)
   4612 			return r;
   4613 	}
   4614 
   4615 	/* dst.x = COS */
   4616 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
   4617 		if (ctx->bc->chip_class == CAYMAN) {
   4618 			for (i = 0 ; i < 3; i++) {
   4619 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4620 				alu.op = ALU_OP1_COS;
   4621 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4622 
   4623 				if (i == 0)
   4624 					alu.dst.write = 1;
   4625 				else
   4626 					alu.dst.write = 0;
   4627 				alu.src[0].sel = ctx->temp_reg;
   4628 				alu.src[0].chan = 0;
   4629 				if (i == 2)
   4630 					alu.last = 1;
   4631 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4632 				if (r)
   4633 					return r;
   4634 			}
   4635 		} else {
   4636 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4637 			alu.op = ALU_OP1_COS;
   4638 			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
   4639 
   4640 			alu.src[0].sel = ctx->temp_reg;
   4641 			alu.src[0].chan = 0;
   4642 			alu.last = 1;
   4643 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4644 			if (r)
   4645 				return r;
   4646 		}
   4647 	}
   4648 
   4649 	/* dst.y = SIN */
   4650 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
   4651 		if (ctx->bc->chip_class == CAYMAN) {
   4652 			for (i = 0 ; i < 3; i++) {
   4653 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4654 				alu.op = ALU_OP1_SIN;
   4655 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4656 				if (i == 1)
   4657 					alu.dst.write = 1;
   4658 				else
   4659 					alu.dst.write = 0;
   4660 				alu.src[0].sel = ctx->temp_reg;
   4661 				alu.src[0].chan = 0;
   4662 				if (i == 2)
   4663 					alu.last = 1;
   4664 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4665 				if (r)
   4666 					return r;
   4667 			}
   4668 		} else {
   4669 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4670 			alu.op = ALU_OP1_SIN;
   4671 			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
   4672 
   4673 			alu.src[0].sel = ctx->temp_reg;
   4674 			alu.src[0].chan = 0;
   4675 			alu.last = 1;
   4676 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4677 			if (r)
   4678 				return r;
   4679 		}
   4680 	}
   4681 
   4682 	/* dst.z = 0.0; */
   4683 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
   4684 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4685 
   4686 		alu.op = ALU_OP1_MOV;
   4687 
   4688 		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
   4689 
   4690 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   4691 		alu.src[0].chan = 0;
   4692 
   4693 		alu.last = 1;
   4694 
   4695 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4696 		if (r)
   4697 			return r;
   4698 	}
   4699 
   4700 	/* dst.w = 1.0; */
   4701 	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
   4702 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4703 
   4704 		alu.op = ALU_OP1_MOV;
   4705 
   4706 		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
   4707 
   4708 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   4709 		alu.src[0].chan = 0;
   4710 
   4711 		alu.last = 1;
   4712 
   4713 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4714 		if (r)
   4715 			return r;
   4716 	}
   4717 
   4718 	return 0;
   4719 }
   4720 
   4721 static int tgsi_kill(struct r600_shader_ctx *ctx)
   4722 {
   4723 	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4724 	struct r600_bytecode_alu alu;
   4725 	int i, r;
   4726 
   4727 	for (i = 0; i < 4; i++) {
   4728 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4729 		alu.op = ctx->inst_info->op;
   4730 
   4731 		alu.dst.chan = i;
   4732 
   4733 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   4734 
   4735 		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
   4736 			alu.src[1].sel = V_SQ_ALU_SRC_1;
   4737 			alu.src[1].neg = 1;
   4738 		} else {
   4739 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   4740 		}
   4741 		if (i == 3) {
   4742 			alu.last = 1;
   4743 		}
   4744 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4745 		if (r)
   4746 			return r;
   4747 	}
   4748 
   4749 	/* kill must be last in ALU */
   4750 	ctx->bc->force_add_cf = 1;
   4751 	ctx->shader->uses_kill = TRUE;
   4752 	return 0;
   4753 }
   4754 
   4755 static int tgsi_lit(struct r600_shader_ctx *ctx)
   4756 {
   4757 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4758 	struct r600_bytecode_alu alu;
   4759 	int r;
   4760 
   4761 	/* tmp.x = max(src.y, 0.0) */
   4762 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4763 	alu.op = ALU_OP2_MAX;
   4764 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
   4765 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
   4766 	alu.src[1].chan = 1;
   4767 
   4768 	alu.dst.sel = ctx->temp_reg;
   4769 	alu.dst.chan = 0;
   4770 	alu.dst.write = 1;
   4771 
   4772 	alu.last = 1;
   4773 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4774 	if (r)
   4775 		return r;
   4776 
   4777 	if (inst->Dst[0].Register.WriteMask & (1 << 2))
   4778 	{
   4779 		int chan;
   4780 		int sel;
   4781 		unsigned i;
   4782 
   4783 		if (ctx->bc->chip_class == CAYMAN) {
   4784 			for (i = 0; i < 3; i++) {
   4785 				/* tmp.z = log(tmp.x) */
   4786 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4787 				alu.op = ALU_OP1_LOG_CLAMPED;
   4788 				alu.src[0].sel = ctx->temp_reg;
   4789 				alu.src[0].chan = 0;
   4790 				alu.dst.sel = ctx->temp_reg;
   4791 				alu.dst.chan = i;
   4792 				if (i == 2) {
   4793 					alu.dst.write = 1;
   4794 					alu.last = 1;
   4795 				} else
   4796 					alu.dst.write = 0;
   4797 
   4798 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4799 				if (r)
   4800 					return r;
   4801 			}
   4802 		} else {
   4803 			/* tmp.z = log(tmp.x) */
   4804 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4805 			alu.op = ALU_OP1_LOG_CLAMPED;
   4806 			alu.src[0].sel = ctx->temp_reg;
   4807 			alu.src[0].chan = 0;
   4808 			alu.dst.sel = ctx->temp_reg;
   4809 			alu.dst.chan = 2;
   4810 			alu.dst.write = 1;
   4811 			alu.last = 1;
   4812 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4813 			if (r)
   4814 				return r;
   4815 		}
   4816 
   4817 		chan = alu.dst.chan;
   4818 		sel = alu.dst.sel;
   4819 
   4820 		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
   4821 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4822 		alu.op = ALU_OP3_MUL_LIT;
   4823 		alu.src[0].sel  = sel;
   4824 		alu.src[0].chan = chan;
   4825 		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
   4826 		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
   4827 		alu.dst.sel = ctx->temp_reg;
   4828 		alu.dst.chan = 0;
   4829 		alu.dst.write = 1;
   4830 		alu.is_op3 = 1;
   4831 		alu.last = 1;
   4832 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4833 		if (r)
   4834 			return r;
   4835 
   4836 		if (ctx->bc->chip_class == CAYMAN) {
   4837 			for (i = 0; i < 3; i++) {
   4838 				/* dst.z = exp(tmp.x) */
   4839 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4840 				alu.op = ALU_OP1_EXP_IEEE;
   4841 				alu.src[0].sel = ctx->temp_reg;
   4842 				alu.src[0].chan = 0;
   4843 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4844 				if (i == 2) {
   4845 					alu.dst.write = 1;
   4846 					alu.last = 1;
   4847 				} else
   4848 					alu.dst.write = 0;
   4849 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   4850 				if (r)
   4851 					return r;
   4852 			}
   4853 		} else {
   4854 			/* dst.z = exp(tmp.x) */
   4855 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4856 			alu.op = ALU_OP1_EXP_IEEE;
   4857 			alu.src[0].sel = ctx->temp_reg;
   4858 			alu.src[0].chan = 0;
   4859 			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
   4860 			alu.last = 1;
   4861 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   4862 			if (r)
   4863 				return r;
   4864 		}
   4865 	}
   4866 
   4867 	/* dst.x, <- 1.0  */
   4868 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4869 	alu.op = ALU_OP1_MOV;
   4870 	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
   4871 	alu.src[0].chan = 0;
   4872 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
   4873 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
   4874 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4875 	if (r)
   4876 		return r;
   4877 
   4878 	/* dst.y = max(src.x, 0.0) */
   4879 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4880 	alu.op = ALU_OP2_MAX;
   4881 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4882 	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
   4883 	alu.src[1].chan = 0;
   4884 	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
   4885 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
   4886 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4887 	if (r)
   4888 		return r;
   4889 
   4890 	/* dst.w, <- 1.0  */
   4891 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4892 	alu.op = ALU_OP1_MOV;
   4893 	alu.src[0].sel  = V_SQ_ALU_SRC_1;
   4894 	alu.src[0].chan = 0;
   4895 	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
   4896 	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
   4897 	alu.last = 1;
   4898 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4899 	if (r)
   4900 		return r;
   4901 
   4902 	return 0;
   4903 }
   4904 
   4905 static int tgsi_rsq(struct r600_shader_ctx *ctx)
   4906 {
   4907 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4908 	struct r600_bytecode_alu alu;
   4909 	int i, r;
   4910 
   4911 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4912 
   4913 	/* XXX:
   4914 	 * For state trackers other than OpenGL, we'll want to use
   4915 	 * _RECIPSQRT_IEEE instead.
   4916 	 */
   4917 	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
   4918 
   4919 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   4920 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
   4921 		r600_bytecode_src_set_abs(&alu.src[i]);
   4922 	}
   4923 	alu.dst.sel = ctx->temp_reg;
   4924 	alu.dst.write = 1;
   4925 	alu.last = 1;
   4926 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4927 	if (r)
   4928 		return r;
   4929 	/* replicate result */
   4930 	return tgsi_helper_tempx_replicate(ctx);
   4931 }
   4932 
   4933 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
   4934 {
   4935 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4936 	struct r600_bytecode_alu alu;
   4937 	int i, r;
   4938 
   4939 	for (i = 0; i < 4; i++) {
   4940 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4941 		alu.src[0].sel = ctx->temp_reg;
   4942 		alu.op = ALU_OP1_MOV;
   4943 		alu.dst.chan = i;
   4944 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   4945 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   4946 		if (i == 3)
   4947 			alu.last = 1;
   4948 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4949 		if (r)
   4950 			return r;
   4951 	}
   4952 	return 0;
   4953 }
   4954 
   4955 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
   4956 {
   4957 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4958 	struct r600_bytecode_alu alu;
   4959 	int i, r;
   4960 
   4961 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4962 	alu.op = ctx->inst_info->op;
   4963 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
   4964 		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
   4965 	}
   4966 	alu.dst.sel = ctx->temp_reg;
   4967 	alu.dst.write = 1;
   4968 	alu.last = 1;
   4969 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   4970 	if (r)
   4971 		return r;
   4972 	/* replicate result */
   4973 	return tgsi_helper_tempx_replicate(ctx);
   4974 }
   4975 
   4976 static int cayman_pow(struct r600_shader_ctx *ctx)
   4977 {
   4978 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   4979 	int i, r;
   4980 	struct r600_bytecode_alu alu;
   4981 	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
   4982 
   4983 	for (i = 0; i < 3; i++) {
   4984 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4985 		alu.op = ALU_OP1_LOG_IEEE;
   4986 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   4987 		alu.dst.sel = ctx->temp_reg;
   4988 		alu.dst.chan = i;
   4989 		alu.dst.write = 1;
   4990 		if (i == 2)
   4991 			alu.last = 1;
   4992 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   4993 		if (r)
   4994 			return r;
   4995 	}
   4996 
   4997 	/* b * LOG2(a) */
   4998 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   4999 	alu.op = ALU_OP2_MUL;
   5000 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
   5001 	alu.src[1].sel = ctx->temp_reg;
   5002 	alu.dst.sel = ctx->temp_reg;
   5003 	alu.dst.write = 1;
   5004 	alu.last = 1;
   5005 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   5006 	if (r)
   5007 		return r;
   5008 
   5009 	for (i = 0; i < last_slot; i++) {
   5010 		/* POW(a,b) = EXP2(b * LOG2(a))*/
   5011 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5012 		alu.op = ALU_OP1_EXP_IEEE;
   5013 		alu.src[0].sel = ctx->temp_reg;
   5014 
   5015 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   5016 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   5017 		if (i == last_slot - 1)
   5018 			alu.last = 1;
   5019 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   5020 		if (r)
   5021 			return r;
   5022 	}
   5023 	return 0;
   5024 }
   5025 
   5026 static int tgsi_pow(struct r600_shader_ctx *ctx)
   5027 {
   5028 	struct r600_bytecode_alu alu;
   5029 	int r;
   5030 
   5031 	/* LOG2(a) */
   5032 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5033 	alu.op = ALU_OP1_LOG_IEEE;
   5034 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   5035 	alu.dst.sel = ctx->temp_reg;
   5036 	alu.dst.write = 1;
   5037 	alu.last = 1;
   5038 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   5039 	if (r)
   5040 		return r;
   5041 	/* b * LOG2(a) */
   5042 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5043 	alu.op = ALU_OP2_MUL;
   5044 	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
   5045 	alu.src[1].sel = ctx->temp_reg;
   5046 	alu.dst.sel = ctx->temp_reg;
   5047 	alu.dst.write = 1;
   5048 	alu.last = 1;
   5049 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   5050 	if (r)
   5051 		return r;
   5052 	/* POW(a,b) = EXP2(b * LOG2(a))*/
   5053 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5054 	alu.op = ALU_OP1_EXP_IEEE;
   5055 	alu.src[0].sel = ctx->temp_reg;
   5056 	alu.dst.sel = ctx->temp_reg;
   5057 	alu.dst.write = 1;
   5058 	alu.last = 1;
   5059 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   5060 	if (r)
   5061 		return r;
   5062 	return tgsi_helper_tempx_replicate(ctx);
   5063 }
   5064 
   5065 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
   5066 {
   5067 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   5068 	struct r600_bytecode_alu alu;
   5069 	int i, r, j;
   5070 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   5071 	int tmp0 = ctx->temp_reg;
   5072 	int tmp1 = r600_get_temp(ctx);
   5073 	int tmp2 = r600_get_temp(ctx);
   5074 	int tmp3 = r600_get_temp(ctx);
   5075 	/* Unsigned path:
   5076 	 *
   5077 	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
   5078 	 *
   5079 	 * 1. tmp0.x = rcp (src2)     = 2^32/src2 + e, where e is rounding error
   5080 	 * 2. tmp0.z = lo (tmp0.x * src2)
   5081 	 * 3. tmp0.w = -tmp0.z
   5082 	 * 4. tmp0.y = hi (tmp0.x * src2)
   5083 	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
   5084 	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
   5085 	 * 7. tmp1.x = tmp0.x - tmp0.w
   5086 	 * 8. tmp1.y = tmp0.x + tmp0.w
   5087 	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
   5088 	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
   5089 	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
   5090 	 *
   5091 	 * 12. tmp0.w = src1 - tmp0.y       = r
   5092 	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
   5093 	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
   5094 	 *
   5095 	 * if DIV
   5096 	 *
   5097 	 *   15. tmp1.z = tmp0.z + 1			= q + 1
   5098 	 *   16. tmp1.w = tmp0.z - 1			= q - 1
   5099 	 *
   5100 	 * else MOD
   5101 	 *
   5102 	 *   15. tmp1.z = tmp0.w - src2			= r - src2
   5103 	 *   16. tmp1.w = tmp0.w + src2			= r + src2
   5104 	 *
   5105 	 * endif
   5106 	 *
   5107 	 * 17. tmp1.x = tmp1.x & tmp1.y
   5108 	 *
   5109 	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
   5110 	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
   5111 	 *
   5112 	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
   5113 	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
   5114 	 *
   5115 	 * Signed path:
   5116 	 *
   5117 	 * Same as unsigned, using abs values of the operands,
   5118 	 * and fixing the sign of the result in the end.
   5119 	 */
   5120 
   5121 	for (i = 0; i < 4; i++) {
   5122 		if (!(write_mask & (1<<i)))
   5123 			continue;
   5124 
   5125 		if (signed_op) {
   5126 
   5127 			/* tmp2.x = -src0 */
   5128 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5129 			alu.op = ALU_OP2_SUB_INT;
   5130 
   5131 			alu.dst.sel = tmp2;
   5132 			alu.dst.chan = 0;
   5133 			alu.dst.write = 1;
   5134 
   5135 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   5136 
   5137 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   5138 
   5139 			alu.last = 1;
   5140 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5141 				return r;
   5142 
   5143 			/* tmp2.y = -src1 */
   5144 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5145 			alu.op = ALU_OP2_SUB_INT;
   5146 
   5147 			alu.dst.sel = tmp2;
   5148 			alu.dst.chan = 1;
   5149 			alu.dst.write = 1;
   5150 
   5151 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   5152 
   5153 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5154 
   5155 			alu.last = 1;
   5156 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5157 				return r;
   5158 
   5159 			/* tmp2.z sign bit is set if src0 and src2 signs are different */
   5160 			/* it will be a sign of the quotient */
   5161 			if (!mod) {
   5162 
   5163 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5164 				alu.op = ALU_OP2_XOR_INT;
   5165 
   5166 				alu.dst.sel = tmp2;
   5167 				alu.dst.chan = 2;
   5168 				alu.dst.write = 1;
   5169 
   5170 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   5171 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5172 
   5173 				alu.last = 1;
   5174 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5175 					return r;
   5176 			}
   5177 
   5178 			/* tmp2.x = |src0| */
   5179 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5180 			alu.op = ALU_OP3_CNDGE_INT;
   5181 			alu.is_op3 = 1;
   5182 
   5183 			alu.dst.sel = tmp2;
   5184 			alu.dst.chan = 0;
   5185 			alu.dst.write = 1;
   5186 
   5187 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   5188 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   5189 			alu.src[2].sel = tmp2;
   5190 			alu.src[2].chan = 0;
   5191 
   5192 			alu.last = 1;
   5193 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5194 				return r;
   5195 
   5196 			/* tmp2.y = |src1| */
   5197 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5198 			alu.op = ALU_OP3_CNDGE_INT;
   5199 			alu.is_op3 = 1;
   5200 
   5201 			alu.dst.sel = tmp2;
   5202 			alu.dst.chan = 1;
   5203 			alu.dst.write = 1;
   5204 
   5205 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   5206 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5207 			alu.src[2].sel = tmp2;
   5208 			alu.src[2].chan = 1;
   5209 
   5210 			alu.last = 1;
   5211 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5212 				return r;
   5213 
   5214 		}
   5215 
   5216 		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
   5217 		if (ctx->bc->chip_class == CAYMAN) {
   5218 			/* tmp3.x = u2f(src2) */
   5219 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5220 			alu.op = ALU_OP1_UINT_TO_FLT;
   5221 
   5222 			alu.dst.sel = tmp3;
   5223 			alu.dst.chan = 0;
   5224 			alu.dst.write = 1;
   5225 
   5226 			if (signed_op) {
   5227 				alu.src[0].sel = tmp2;
   5228 				alu.src[0].chan = 1;
   5229 			} else {
   5230 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   5231 			}
   5232 
   5233 			alu.last = 1;
   5234 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5235 				return r;
   5236 
   5237 			/* tmp0.x = recip(tmp3.x) */
   5238 			for (j = 0 ; j < 3; j++) {
   5239 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5240 				alu.op = ALU_OP1_RECIP_IEEE;
   5241 
   5242 				alu.dst.sel = tmp0;
   5243 				alu.dst.chan = j;
   5244 				alu.dst.write = (j == 0);
   5245 
   5246 				alu.src[0].sel = tmp3;
   5247 				alu.src[0].chan = 0;
   5248 
   5249 				if (j == 2)
   5250 					alu.last = 1;
   5251 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5252 					return r;
   5253 			}
   5254 
   5255 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5256 			alu.op = ALU_OP2_MUL;
   5257 
   5258 			alu.src[0].sel = tmp0;
   5259 			alu.src[0].chan = 0;
   5260 
   5261 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   5262 			alu.src[1].value = 0x4f800000;
   5263 
   5264 			alu.dst.sel = tmp3;
   5265 			alu.dst.write = 1;
   5266 			alu.last = 1;
   5267 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   5268 			if (r)
   5269 				return r;
   5270 
   5271 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5272 			alu.op = ALU_OP1_FLT_TO_UINT;
   5273 
   5274 			alu.dst.sel = tmp0;
   5275 			alu.dst.chan = 0;
   5276 			alu.dst.write = 1;
   5277 
   5278 			alu.src[0].sel = tmp3;
   5279 			alu.src[0].chan = 0;
   5280 
   5281 			alu.last = 1;
   5282 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5283 				return r;
   5284 
   5285 		} else {
   5286 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5287 			alu.op = ALU_OP1_RECIP_UINT;
   5288 
   5289 			alu.dst.sel = tmp0;
   5290 			alu.dst.chan = 0;
   5291 			alu.dst.write = 1;
   5292 
   5293 			if (signed_op) {
   5294 				alu.src[0].sel = tmp2;
   5295 				alu.src[0].chan = 1;
   5296 			} else {
   5297 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   5298 			}
   5299 
   5300 			alu.last = 1;
   5301 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5302 				return r;
   5303 		}
   5304 
   5305 		/* 2. tmp0.z = lo (tmp0.x * src2) */
   5306 		if (ctx->bc->chip_class == CAYMAN) {
   5307 			for (j = 0 ; j < 4; j++) {
   5308 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5309 				alu.op = ALU_OP2_MULLO_UINT;
   5310 
   5311 				alu.dst.sel = tmp0;
   5312 				alu.dst.chan = j;
   5313 				alu.dst.write = (j == 2);
   5314 
   5315 				alu.src[0].sel = tmp0;
   5316 				alu.src[0].chan = 0;
   5317 				if (signed_op) {
   5318 					alu.src[1].sel = tmp2;
   5319 					alu.src[1].chan = 1;
   5320 				} else {
   5321 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5322 				}
   5323 
   5324 				alu.last = (j == 3);
   5325 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5326 					return r;
   5327 			}
   5328 		} else {
   5329 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5330 			alu.op = ALU_OP2_MULLO_UINT;
   5331 
   5332 			alu.dst.sel = tmp0;
   5333 			alu.dst.chan = 2;
   5334 			alu.dst.write = 1;
   5335 
   5336 			alu.src[0].sel = tmp0;
   5337 			alu.src[0].chan = 0;
   5338 			if (signed_op) {
   5339 				alu.src[1].sel = tmp2;
   5340 				alu.src[1].chan = 1;
   5341 			} else {
   5342 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5343 			}
   5344 
   5345 			alu.last = 1;
   5346 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5347 				return r;
   5348 		}
   5349 
   5350 		/* 3. tmp0.w = -tmp0.z */
   5351 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5352 		alu.op = ALU_OP2_SUB_INT;
   5353 
   5354 		alu.dst.sel = tmp0;
   5355 		alu.dst.chan = 3;
   5356 		alu.dst.write = 1;
   5357 
   5358 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   5359 		alu.src[1].sel = tmp0;
   5360 		alu.src[1].chan = 2;
   5361 
   5362 		alu.last = 1;
   5363 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5364 			return r;
   5365 
   5366 		/* 4. tmp0.y = hi (tmp0.x * src2) */
   5367 		if (ctx->bc->chip_class == CAYMAN) {
   5368 			for (j = 0 ; j < 4; j++) {
   5369 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5370 				alu.op = ALU_OP2_MULHI_UINT;
   5371 
   5372 				alu.dst.sel = tmp0;
   5373 				alu.dst.chan = j;
   5374 				alu.dst.write = (j == 1);
   5375 
   5376 				alu.src[0].sel = tmp0;
   5377 				alu.src[0].chan = 0;
   5378 
   5379 				if (signed_op) {
   5380 					alu.src[1].sel = tmp2;
   5381 					alu.src[1].chan = 1;
   5382 				} else {
   5383 					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5384 				}
   5385 				alu.last = (j == 3);
   5386 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5387 					return r;
   5388 			}
   5389 		} else {
   5390 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5391 			alu.op = ALU_OP2_MULHI_UINT;
   5392 
   5393 			alu.dst.sel = tmp0;
   5394 			alu.dst.chan = 1;
   5395 			alu.dst.write = 1;
   5396 
   5397 			alu.src[0].sel = tmp0;
   5398 			alu.src[0].chan = 0;
   5399 
   5400 			if (signed_op) {
   5401 				alu.src[1].sel = tmp2;
   5402 				alu.src[1].chan = 1;
   5403 			} else {
   5404 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5405 			}
   5406 
   5407 			alu.last = 1;
   5408 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5409 				return r;
   5410 		}
   5411 
   5412 		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
   5413 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5414 		alu.op = ALU_OP3_CNDE_INT;
   5415 		alu.is_op3 = 1;
   5416 
   5417 		alu.dst.sel = tmp0;
   5418 		alu.dst.chan = 2;
   5419 		alu.dst.write = 1;
   5420 
   5421 		alu.src[0].sel = tmp0;
   5422 		alu.src[0].chan = 1;
   5423 		alu.src[1].sel = tmp0;
   5424 		alu.src[1].chan = 3;
   5425 		alu.src[2].sel = tmp0;
   5426 		alu.src[2].chan = 2;
   5427 
   5428 		alu.last = 1;
   5429 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5430 			return r;
   5431 
   5432 		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
   5433 		if (ctx->bc->chip_class == CAYMAN) {
   5434 			for (j = 0 ; j < 4; j++) {
   5435 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5436 				alu.op = ALU_OP2_MULHI_UINT;
   5437 
   5438 				alu.dst.sel = tmp0;
   5439 				alu.dst.chan = j;
   5440 				alu.dst.write = (j == 3);
   5441 
   5442 				alu.src[0].sel = tmp0;
   5443 				alu.src[0].chan = 2;
   5444 
   5445 				alu.src[1].sel = tmp0;
   5446 				alu.src[1].chan = 0;
   5447 
   5448 				alu.last = (j == 3);
   5449 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5450 					return r;
   5451 			}
   5452 		} else {
   5453 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5454 			alu.op = ALU_OP2_MULHI_UINT;
   5455 
   5456 			alu.dst.sel = tmp0;
   5457 			alu.dst.chan = 3;
   5458 			alu.dst.write = 1;
   5459 
   5460 			alu.src[0].sel = tmp0;
   5461 			alu.src[0].chan = 2;
   5462 
   5463 			alu.src[1].sel = tmp0;
   5464 			alu.src[1].chan = 0;
   5465 
   5466 			alu.last = 1;
   5467 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5468 				return r;
   5469 		}
   5470 
   5471 		/* 7. tmp1.x = tmp0.x - tmp0.w */
   5472 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5473 		alu.op = ALU_OP2_SUB_INT;
   5474 
   5475 		alu.dst.sel = tmp1;
   5476 		alu.dst.chan = 0;
   5477 		alu.dst.write = 1;
   5478 
   5479 		alu.src[0].sel = tmp0;
   5480 		alu.src[0].chan = 0;
   5481 		alu.src[1].sel = tmp0;
   5482 		alu.src[1].chan = 3;
   5483 
   5484 		alu.last = 1;
   5485 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5486 			return r;
   5487 
   5488 		/* 8. tmp1.y = tmp0.x + tmp0.w */
   5489 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5490 		alu.op = ALU_OP2_ADD_INT;
   5491 
   5492 		alu.dst.sel = tmp1;
   5493 		alu.dst.chan = 1;
   5494 		alu.dst.write = 1;
   5495 
   5496 		alu.src[0].sel = tmp0;
   5497 		alu.src[0].chan = 0;
   5498 		alu.src[1].sel = tmp0;
   5499 		alu.src[1].chan = 3;
   5500 
   5501 		alu.last = 1;
   5502 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5503 			return r;
   5504 
   5505 		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
   5506 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5507 		alu.op = ALU_OP3_CNDE_INT;
   5508 		alu.is_op3 = 1;
   5509 
   5510 		alu.dst.sel = tmp0;
   5511 		alu.dst.chan = 0;
   5512 		alu.dst.write = 1;
   5513 
   5514 		alu.src[0].sel = tmp0;
   5515 		alu.src[0].chan = 1;
   5516 		alu.src[1].sel = tmp1;
   5517 		alu.src[1].chan = 1;
   5518 		alu.src[2].sel = tmp1;
   5519 		alu.src[2].chan = 0;
   5520 
   5521 		alu.last = 1;
   5522 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5523 			return r;
   5524 
   5525 		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
   5526 		if (ctx->bc->chip_class == CAYMAN) {
   5527 			for (j = 0 ; j < 4; j++) {
   5528 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5529 				alu.op = ALU_OP2_MULHI_UINT;
   5530 
   5531 				alu.dst.sel = tmp0;
   5532 				alu.dst.chan = j;
   5533 				alu.dst.write = (j == 2);
   5534 
   5535 				alu.src[0].sel = tmp0;
   5536 				alu.src[0].chan = 0;
   5537 
   5538 				if (signed_op) {
   5539 					alu.src[1].sel = tmp2;
   5540 					alu.src[1].chan = 0;
   5541 				} else {
   5542 					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   5543 				}
   5544 
   5545 				alu.last = (j == 3);
   5546 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5547 					return r;
   5548 			}
   5549 		} else {
   5550 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5551 			alu.op = ALU_OP2_MULHI_UINT;
   5552 
   5553 			alu.dst.sel = tmp0;
   5554 			alu.dst.chan = 2;
   5555 			alu.dst.write = 1;
   5556 
   5557 			alu.src[0].sel = tmp0;
   5558 			alu.src[0].chan = 0;
   5559 
   5560 			if (signed_op) {
   5561 				alu.src[1].sel = tmp2;
   5562 				alu.src[1].chan = 0;
   5563 			} else {
   5564 				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   5565 			}
   5566 
   5567 			alu.last = 1;
   5568 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5569 				return r;
   5570 		}
   5571 
   5572 		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
   5573 		if (ctx->bc->chip_class == CAYMAN) {
   5574 			for (j = 0 ; j < 4; j++) {
   5575 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5576 				alu.op = ALU_OP2_MULLO_UINT;
   5577 
   5578 				alu.dst.sel = tmp0;
   5579 				alu.dst.chan = j;
   5580 				alu.dst.write = (j == 1);
   5581 
   5582 				if (signed_op) {
   5583 					alu.src[0].sel = tmp2;
   5584 					alu.src[0].chan = 1;
   5585 				} else {
   5586 					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   5587 				}
   5588 
   5589 				alu.src[1].sel = tmp0;
   5590 				alu.src[1].chan = 2;
   5591 
   5592 				alu.last = (j == 3);
   5593 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5594 					return r;
   5595 			}
   5596 		} else {
   5597 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5598 			alu.op = ALU_OP2_MULLO_UINT;
   5599 
   5600 			alu.dst.sel = tmp0;
   5601 			alu.dst.chan = 1;
   5602 			alu.dst.write = 1;
   5603 
   5604 			if (signed_op) {
   5605 				alu.src[0].sel = tmp2;
   5606 				alu.src[0].chan = 1;
   5607 			} else {
   5608 				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   5609 			}
   5610 
   5611 			alu.src[1].sel = tmp0;
   5612 			alu.src[1].chan = 2;
   5613 
   5614 			alu.last = 1;
   5615 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5616 				return r;
   5617 		}
   5618 
   5619 		/* 12. tmp0.w = src1 - tmp0.y       = r */
   5620 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5621 		alu.op = ALU_OP2_SUB_INT;
   5622 
   5623 		alu.dst.sel = tmp0;
   5624 		alu.dst.chan = 3;
   5625 		alu.dst.write = 1;
   5626 
   5627 		if (signed_op) {
   5628 			alu.src[0].sel = tmp2;
   5629 			alu.src[0].chan = 0;
   5630 		} else {
   5631 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   5632 		}
   5633 
   5634 		alu.src[1].sel = tmp0;
   5635 		alu.src[1].chan = 1;
   5636 
   5637 		alu.last = 1;
   5638 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5639 			return r;
   5640 
   5641 		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
   5642 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5643 		alu.op = ALU_OP2_SETGE_UINT;
   5644 
   5645 		alu.dst.sel = tmp1;
   5646 		alu.dst.chan = 0;
   5647 		alu.dst.write = 1;
   5648 
   5649 		alu.src[0].sel = tmp0;
   5650 		alu.src[0].chan = 3;
   5651 		if (signed_op) {
   5652 			alu.src[1].sel = tmp2;
   5653 			alu.src[1].chan = 1;
   5654 		} else {
   5655 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5656 		}
   5657 
   5658 		alu.last = 1;
   5659 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5660 			return r;
   5661 
   5662 		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
   5663 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5664 		alu.op = ALU_OP2_SETGE_UINT;
   5665 
   5666 		alu.dst.sel = tmp1;
   5667 		alu.dst.chan = 1;
   5668 		alu.dst.write = 1;
   5669 
   5670 		if (signed_op) {
   5671 			alu.src[0].sel = tmp2;
   5672 			alu.src[0].chan = 0;
   5673 		} else {
   5674 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   5675 		}
   5676 
   5677 		alu.src[1].sel = tmp0;
   5678 		alu.src[1].chan = 1;
   5679 
   5680 		alu.last = 1;
   5681 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5682 			return r;
   5683 
   5684 		if (mod) { /* UMOD */
   5685 
   5686 			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
   5687 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5688 			alu.op = ALU_OP2_SUB_INT;
   5689 
   5690 			alu.dst.sel = tmp1;
   5691 			alu.dst.chan = 2;
   5692 			alu.dst.write = 1;
   5693 
   5694 			alu.src[0].sel = tmp0;
   5695 			alu.src[0].chan = 3;
   5696 
   5697 			if (signed_op) {
   5698 				alu.src[1].sel = tmp2;
   5699 				alu.src[1].chan = 1;
   5700 			} else {
   5701 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5702 			}
   5703 
   5704 			alu.last = 1;
   5705 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5706 				return r;
   5707 
   5708 			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
   5709 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5710 			alu.op = ALU_OP2_ADD_INT;
   5711 
   5712 			alu.dst.sel = tmp1;
   5713 			alu.dst.chan = 3;
   5714 			alu.dst.write = 1;
   5715 
   5716 			alu.src[0].sel = tmp0;
   5717 			alu.src[0].chan = 3;
   5718 			if (signed_op) {
   5719 				alu.src[1].sel = tmp2;
   5720 				alu.src[1].chan = 1;
   5721 			} else {
   5722 				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   5723 			}
   5724 
   5725 			alu.last = 1;
   5726 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5727 				return r;
   5728 
   5729 		} else { /* UDIV */
   5730 
   5731 			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
   5732 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5733 			alu.op = ALU_OP2_ADD_INT;
   5734 
   5735 			alu.dst.sel = tmp1;
   5736 			alu.dst.chan = 2;
   5737 			alu.dst.write = 1;
   5738 
   5739 			alu.src[0].sel = tmp0;
   5740 			alu.src[0].chan = 2;
   5741 			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
   5742 
   5743 			alu.last = 1;
   5744 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5745 				return r;
   5746 
   5747 			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
   5748 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5749 			alu.op = ALU_OP2_ADD_INT;
   5750 
   5751 			alu.dst.sel = tmp1;
   5752 			alu.dst.chan = 3;
   5753 			alu.dst.write = 1;
   5754 
   5755 			alu.src[0].sel = tmp0;
   5756 			alu.src[0].chan = 2;
   5757 			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
   5758 
   5759 			alu.last = 1;
   5760 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5761 				return r;
   5762 
   5763 		}
   5764 
   5765 		/* 17. tmp1.x = tmp1.x & tmp1.y */
   5766 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5767 		alu.op = ALU_OP2_AND_INT;
   5768 
   5769 		alu.dst.sel = tmp1;
   5770 		alu.dst.chan = 0;
   5771 		alu.dst.write = 1;
   5772 
   5773 		alu.src[0].sel = tmp1;
   5774 		alu.src[0].chan = 0;
   5775 		alu.src[1].sel = tmp1;
   5776 		alu.src[1].chan = 1;
   5777 
   5778 		alu.last = 1;
   5779 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5780 			return r;
   5781 
   5782 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
   5783 		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
   5784 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5785 		alu.op = ALU_OP3_CNDE_INT;
   5786 		alu.is_op3 = 1;
   5787 
   5788 		alu.dst.sel = tmp0;
   5789 		alu.dst.chan = 2;
   5790 		alu.dst.write = 1;
   5791 
   5792 		alu.src[0].sel = tmp1;
   5793 		alu.src[0].chan = 0;
   5794 		alu.src[1].sel = tmp0;
   5795 		alu.src[1].chan = mod ? 3 : 2;
   5796 		alu.src[2].sel = tmp1;
   5797 		alu.src[2].chan = 2;
   5798 
   5799 		alu.last = 1;
   5800 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5801 			return r;
   5802 
   5803 		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
   5804 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5805 		alu.op = ALU_OP3_CNDE_INT;
   5806 		alu.is_op3 = 1;
   5807 
   5808 		if (signed_op) {
   5809 			alu.dst.sel = tmp0;
   5810 			alu.dst.chan = 2;
   5811 			alu.dst.write = 1;
   5812 		} else {
   5813 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   5814 		}
   5815 
   5816 		alu.src[0].sel = tmp1;
   5817 		alu.src[0].chan = 1;
   5818 		alu.src[1].sel = tmp1;
   5819 		alu.src[1].chan = 3;
   5820 		alu.src[2].sel = tmp0;
   5821 		alu.src[2].chan = 2;
   5822 
   5823 		alu.last = 1;
   5824 		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5825 			return r;
   5826 
   5827 		if (signed_op) {
   5828 
   5829 			/* fix the sign of the result */
   5830 
   5831 			if (mod) {
   5832 
   5833 				/* tmp0.x = -tmp0.z */
   5834 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5835 				alu.op = ALU_OP2_SUB_INT;
   5836 
   5837 				alu.dst.sel = tmp0;
   5838 				alu.dst.chan = 0;
   5839 				alu.dst.write = 1;
   5840 
   5841 				alu.src[0].sel = V_SQ_ALU_SRC_0;
   5842 				alu.src[1].sel = tmp0;
   5843 				alu.src[1].chan = 2;
   5844 
   5845 				alu.last = 1;
   5846 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5847 					return r;
   5848 
   5849 				/* sign of the remainder is the same as the sign of src0 */
   5850 				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
   5851 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5852 				alu.op = ALU_OP3_CNDGE_INT;
   5853 				alu.is_op3 = 1;
   5854 
   5855 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   5856 
   5857 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   5858 				alu.src[1].sel = tmp0;
   5859 				alu.src[1].chan = 2;
   5860 				alu.src[2].sel = tmp0;
   5861 				alu.src[2].chan = 0;
   5862 
   5863 				alu.last = 1;
   5864 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5865 					return r;
   5866 
   5867 			} else {
   5868 
   5869 				/* tmp0.x = -tmp0.z */
   5870 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5871 				alu.op = ALU_OP2_SUB_INT;
   5872 
   5873 				alu.dst.sel = tmp0;
   5874 				alu.dst.chan = 0;
   5875 				alu.dst.write = 1;
   5876 
   5877 				alu.src[0].sel = V_SQ_ALU_SRC_0;
   5878 				alu.src[1].sel = tmp0;
   5879 				alu.src[1].chan = 2;
   5880 
   5881 				alu.last = 1;
   5882 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5883 					return r;
   5884 
   5885 				/* fix the quotient sign (same as the sign of src0*src1) */
   5886 				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
   5887 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5888 				alu.op = ALU_OP3_CNDGE_INT;
   5889 				alu.is_op3 = 1;
   5890 
   5891 				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   5892 
   5893 				alu.src[0].sel = tmp2;
   5894 				alu.src[0].chan = 2;
   5895 				alu.src[1].sel = tmp0;
   5896 				alu.src[1].chan = 2;
   5897 				alu.src[2].sel = tmp0;
   5898 				alu.src[2].chan = 0;
   5899 
   5900 				alu.last = 1;
   5901 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   5902 					return r;
   5903 			}
   5904 		}
   5905 	}
   5906 	return 0;
   5907 }
   5908 
   5909 static int tgsi_udiv(struct r600_shader_ctx *ctx)
   5910 {
   5911 	return tgsi_divmod(ctx, 0, 0);
   5912 }
   5913 
   5914 static int tgsi_umod(struct r600_shader_ctx *ctx)
   5915 {
   5916 	return tgsi_divmod(ctx, 1, 0);
   5917 }
   5918 
   5919 static int tgsi_idiv(struct r600_shader_ctx *ctx)
   5920 {
   5921 	return tgsi_divmod(ctx, 0, 1);
   5922 }
   5923 
   5924 static int tgsi_imod(struct r600_shader_ctx *ctx)
   5925 {
   5926 	return tgsi_divmod(ctx, 1, 1);
   5927 }
   5928 
   5929 
   5930 static int tgsi_f2i(struct r600_shader_ctx *ctx)
   5931 {
   5932 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   5933 	struct r600_bytecode_alu alu;
   5934 	int i, r;
   5935 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   5936 	int last_inst = tgsi_last_instruction(write_mask);
   5937 
   5938 	for (i = 0; i < 4; i++) {
   5939 		if (!(write_mask & (1<<i)))
   5940 			continue;
   5941 
   5942 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5943 		alu.op = ALU_OP1_TRUNC;
   5944 
   5945 		alu.dst.sel = ctx->temp_reg;
   5946 		alu.dst.chan = i;
   5947 		alu.dst.write = 1;
   5948 
   5949 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   5950 		if (i == last_inst)
   5951 			alu.last = 1;
   5952 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   5953 		if (r)
   5954 			return r;
   5955 	}
   5956 
   5957 	for (i = 0; i < 4; i++) {
   5958 		if (!(write_mask & (1<<i)))
   5959 			continue;
   5960 
   5961 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5962 		alu.op = ctx->inst_info->op;
   5963 
   5964 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   5965 
   5966 		alu.src[0].sel = ctx->temp_reg;
   5967 		alu.src[0].chan = i;
   5968 
   5969 		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
   5970 			alu.last = 1;
   5971 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   5972 		if (r)
   5973 			return r;
   5974 	}
   5975 
   5976 	return 0;
   5977 }
   5978 
   5979 static int tgsi_iabs(struct r600_shader_ctx *ctx)
   5980 {
   5981 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   5982 	struct r600_bytecode_alu alu;
   5983 	int i, r;
   5984 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   5985 	int last_inst = tgsi_last_instruction(write_mask);
   5986 
   5987 	/* tmp = -src */
   5988 	for (i = 0; i < 4; i++) {
   5989 		if (!(write_mask & (1<<i)))
   5990 			continue;
   5991 
   5992 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   5993 		alu.op = ALU_OP2_SUB_INT;
   5994 
   5995 		alu.dst.sel = ctx->temp_reg;
   5996 		alu.dst.chan = i;
   5997 		alu.dst.write = 1;
   5998 
   5999 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   6000 		alu.src[0].sel = V_SQ_ALU_SRC_0;
   6001 
   6002 		if (i == last_inst)
   6003 			alu.last = 1;
   6004 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6005 		if (r)
   6006 			return r;
   6007 	}
   6008 
   6009 	/* dst = (src >= 0 ? src : tmp) */
   6010 	for (i = 0; i < 4; i++) {
   6011 		if (!(write_mask & (1<<i)))
   6012 			continue;
   6013 
   6014 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6015 		alu.op = ALU_OP3_CNDGE_INT;
   6016 		alu.is_op3 = 1;
   6017 		alu.dst.write = 1;
   6018 
   6019 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6020 
   6021 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   6022 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   6023 		alu.src[2].sel = ctx->temp_reg;
   6024 		alu.src[2].chan = i;
   6025 
   6026 		if (i == last_inst)
   6027 			alu.last = 1;
   6028 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6029 		if (r)
   6030 			return r;
   6031 	}
   6032 	return 0;
   6033 }
   6034 
   6035 static int tgsi_issg(struct r600_shader_ctx *ctx)
   6036 {
   6037 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6038 	struct r600_bytecode_alu alu;
   6039 	int i, r;
   6040 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   6041 	int last_inst = tgsi_last_instruction(write_mask);
   6042 
   6043 	/* tmp = (src >= 0 ? src : -1) */
   6044 	for (i = 0; i < 4; i++) {
   6045 		if (!(write_mask & (1<<i)))
   6046 			continue;
   6047 
   6048 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6049 		alu.op = ALU_OP3_CNDGE_INT;
   6050 		alu.is_op3 = 1;
   6051 
   6052 		alu.dst.sel = ctx->temp_reg;
   6053 		alu.dst.chan = i;
   6054 		alu.dst.write = 1;
   6055 
   6056 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   6057 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   6058 		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
   6059 
   6060 		if (i == last_inst)
   6061 			alu.last = 1;
   6062 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6063 		if (r)
   6064 			return r;
   6065 	}
   6066 
   6067 	/* dst = (tmp > 0 ? 1 : tmp) */
   6068 	for (i = 0; i < 4; i++) {
   6069 		if (!(write_mask & (1<<i)))
   6070 			continue;
   6071 
   6072 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6073 		alu.op = ALU_OP3_CNDGT_INT;
   6074 		alu.is_op3 = 1;
   6075 		alu.dst.write = 1;
   6076 
   6077 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6078 
   6079 		alu.src[0].sel = ctx->temp_reg;
   6080 		alu.src[0].chan = i;
   6081 
   6082 		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
   6083 
   6084 		alu.src[2].sel = ctx->temp_reg;
   6085 		alu.src[2].chan = i;
   6086 
   6087 		if (i == last_inst)
   6088 			alu.last = 1;
   6089 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6090 		if (r)
   6091 			return r;
   6092 	}
   6093 	return 0;
   6094 }
   6095 
   6096 
   6097 
   6098 static int tgsi_ssg(struct r600_shader_ctx *ctx)
   6099 {
   6100 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6101 	struct r600_bytecode_alu alu;
   6102 	int i, r;
   6103 
   6104 	/* tmp = (src > 0 ? 1 : src) */
   6105 	for (i = 0; i < 4; i++) {
   6106 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6107 		alu.op = ALU_OP3_CNDGT;
   6108 		alu.is_op3 = 1;
   6109 
   6110 		alu.dst.sel = ctx->temp_reg;
   6111 		alu.dst.chan = i;
   6112 
   6113 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   6114 		alu.src[1].sel = V_SQ_ALU_SRC_1;
   6115 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
   6116 
   6117 		if (i == 3)
   6118 			alu.last = 1;
   6119 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6120 		if (r)
   6121 			return r;
   6122 	}
   6123 
   6124 	/* dst = (-tmp > 0 ? -1 : tmp) */
   6125 	for (i = 0; i < 4; i++) {
   6126 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6127 		alu.op = ALU_OP3_CNDGT;
   6128 		alu.is_op3 = 1;
   6129 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6130 
   6131 		alu.src[0].sel = ctx->temp_reg;
   6132 		alu.src[0].chan = i;
   6133 		alu.src[0].neg = 1;
   6134 
   6135 		alu.src[1].sel = V_SQ_ALU_SRC_1;
   6136 		alu.src[1].neg = 1;
   6137 
   6138 		alu.src[2].sel = ctx->temp_reg;
   6139 		alu.src[2].chan = i;
   6140 
   6141 		if (i == 3)
   6142 			alu.last = 1;
   6143 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6144 		if (r)
   6145 			return r;
   6146 	}
   6147 	return 0;
   6148 }
   6149 
   6150 static int tgsi_bfi(struct r600_shader_ctx *ctx)
   6151 {
   6152 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6153 	struct r600_bytecode_alu alu;
   6154 	int i, r, t1, t2;
   6155 
   6156 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   6157 	int last_inst = tgsi_last_instruction(write_mask);
   6158 
   6159 	t1 = ctx->temp_reg;
   6160 
   6161 	for (i = 0; i < 4; i++) {
   6162 		if (!(write_mask & (1<<i)))
   6163 			continue;
   6164 
   6165 		/* create mask tmp */
   6166 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6167 		alu.op = ALU_OP2_BFM_INT;
   6168 		alu.dst.sel = t1;
   6169 		alu.dst.chan = i;
   6170 		alu.dst.write = 1;
   6171 		alu.last = i == last_inst;
   6172 
   6173 		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
   6174 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   6175 
   6176 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6177 		if (r)
   6178 			return r;
   6179 	}
   6180 
   6181 	t2 = r600_get_temp(ctx);
   6182 
   6183 	for (i = 0; i < 4; i++) {
   6184 		if (!(write_mask & (1<<i)))
   6185 			continue;
   6186 
   6187 		/* shift insert left */
   6188 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6189 		alu.op = ALU_OP2_LSHL_INT;
   6190 		alu.dst.sel = t2;
   6191 		alu.dst.chan = i;
   6192 		alu.dst.write = 1;
   6193 		alu.last = i == last_inst;
   6194 
   6195 		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   6196 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   6197 
   6198 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6199 		if (r)
   6200 			return r;
   6201 	}
   6202 
   6203 	for (i = 0; i < 4; i++) {
   6204 		if (!(write_mask & (1<<i)))
   6205 			continue;
   6206 
   6207 		/* actual bitfield insert */
   6208 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6209 		alu.op = ALU_OP3_BFI_INT;
   6210 		alu.is_op3 = 1;
   6211 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6212 		alu.dst.chan = i;
   6213 		alu.dst.write = 1;
   6214 		alu.last = i == last_inst;
   6215 
   6216 		alu.src[0].sel = t1;
   6217 		alu.src[0].chan = i;
   6218 		alu.src[1].sel = t2;
   6219 		alu.src[1].chan = i;
   6220 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
   6221 
   6222 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6223 		if (r)
   6224 			return r;
   6225 	}
   6226 
   6227 	return 0;
   6228 }
   6229 
   6230 static int tgsi_msb(struct r600_shader_ctx *ctx)
   6231 {
   6232 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6233 	struct r600_bytecode_alu alu;
   6234 	int i, r, t1, t2;
   6235 
   6236 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
   6237 	int last_inst = tgsi_last_instruction(write_mask);
   6238 
   6239 	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
   6240 		ctx->inst_info->op == ALU_OP1_FFBH_UINT);
   6241 
   6242 	t1 = ctx->temp_reg;
   6243 
   6244 	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
   6245 	for (i = 0; i < 4; i++) {
   6246 		if (!(write_mask & (1<<i)))
   6247 			continue;
   6248 
   6249 		/* t1 = FFBH_INT / FFBH_UINT */
   6250 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6251 		alu.op = ctx->inst_info->op;
   6252 		alu.dst.sel = t1;
   6253 		alu.dst.chan = i;
   6254 		alu.dst.write = 1;
   6255 		alu.last = i == last_inst;
   6256 
   6257 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   6258 
   6259 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6260 		if (r)
   6261 			return r;
   6262 	}
   6263 
   6264 	t2 = r600_get_temp(ctx);
   6265 
   6266 	for (i = 0; i < 4; i++) {
   6267 		if (!(write_mask & (1<<i)))
   6268 			continue;
   6269 
   6270 		/* t2 = 31 - t1 */
   6271 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6272 		alu.op = ALU_OP2_SUB_INT;
   6273 		alu.dst.sel = t2;
   6274 		alu.dst.chan = i;
   6275 		alu.dst.write = 1;
   6276 		alu.last = i == last_inst;
   6277 
   6278 		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
   6279 		alu.src[0].value = 31;
   6280 		alu.src[1].sel = t1;
   6281 		alu.src[1].chan = i;
   6282 
   6283 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6284 		if (r)
   6285 			return r;
   6286 	}
   6287 
   6288 	for (i = 0; i < 4; i++) {
   6289 		if (!(write_mask & (1<<i)))
   6290 			continue;
   6291 
   6292 		/* result = t1 >= 0 ? t2 : t1 */
   6293 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6294 		alu.op = ALU_OP3_CNDGE_INT;
   6295 		alu.is_op3 = 1;
   6296 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6297 		alu.dst.chan = i;
   6298 		alu.dst.write = 1;
   6299 		alu.last = i == last_inst;
   6300 
   6301 		alu.src[0].sel = t1;
   6302 		alu.src[0].chan = i;
   6303 		alu.src[1].sel = t2;
   6304 		alu.src[1].chan = i;
   6305 		alu.src[2].sel = t1;
   6306 		alu.src[2].chan = i;
   6307 
   6308 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6309 		if (r)
   6310 			return r;
   6311 	}
   6312 
   6313 	return 0;
   6314 }
   6315 
   6316 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
   6317 {
   6318 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6319 	struct r600_bytecode_alu alu;
   6320 	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
   6321 	unsigned location;
   6322 	int input;
   6323 
   6324 	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
   6325 
   6326 	input = inst->Src[0].Register.Index;
   6327 
   6328 	/* Interpolators have been marked for use already by allocate_system_value_inputs */
   6329 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
   6330 		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   6331 		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
   6332 	}
   6333 	else {
   6334 		location = TGSI_INTERPOLATE_LOC_CENTROID;
   6335 	}
   6336 
   6337 	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
   6338 	if (k < 0)
   6339 		k = 0;
   6340 	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
   6341 	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
   6342 
   6343 	/* NOTE: currently offset is not perspective correct */
   6344 	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
   6345 		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   6346 		int sample_gpr = -1;
   6347 		int gradientsH, gradientsV;
   6348 		struct r600_bytecode_tex tex;
   6349 
   6350 		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   6351 			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
   6352 		}
   6353 
   6354 		gradientsH = r600_get_temp(ctx);
   6355 		gradientsV = r600_get_temp(ctx);
   6356 		for (i = 0; i < 2; i++) {
   6357 			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   6358 			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
   6359 			tex.src_gpr = interp_gpr;
   6360 			tex.src_sel_x = interp_base_chan + 0;
   6361 			tex.src_sel_y = interp_base_chan + 1;
   6362 			tex.src_sel_z = 0;
   6363 			tex.src_sel_w = 0;
   6364 			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
   6365 			tex.dst_sel_x = 0;
   6366 			tex.dst_sel_y = 1;
   6367 			tex.dst_sel_z = 7;
   6368 			tex.dst_sel_w = 7;
   6369 			tex.inst_mod = 1; // Use per pixel gradient calculation
   6370 			tex.sampler_id = 0;
   6371 			tex.resource_id = tex.sampler_id;
   6372 			r = r600_bytecode_add_tex(ctx->bc, &tex);
   6373 			if (r)
   6374 				return r;
   6375 		}
   6376 
   6377 		for (i = 0; i < 2; i++) {
   6378 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6379 			alu.op = ALU_OP3_MULADD;
   6380 			alu.is_op3 = 1;
   6381 			alu.src[0].sel = gradientsH;
   6382 			alu.src[0].chan = i;
   6383 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   6384 				alu.src[1].sel = sample_gpr;
   6385 				alu.src[1].chan = 2;
   6386 			}
   6387 			else {
   6388 				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
   6389 			}
   6390 			alu.src[2].sel = interp_gpr;
   6391 			alu.src[2].chan = interp_base_chan + i;
   6392 			alu.dst.sel = ctx->temp_reg;
   6393 			alu.dst.chan = i;
   6394 			alu.last = i == 1;
   6395 
   6396 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   6397 			if (r)
   6398 				return r;
   6399 		}
   6400 
   6401 		for (i = 0; i < 2; i++) {
   6402 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6403 			alu.op = ALU_OP3_MULADD;
   6404 			alu.is_op3 = 1;
   6405 			alu.src[0].sel = gradientsV;
   6406 			alu.src[0].chan = i;
   6407 			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   6408 				alu.src[1].sel = sample_gpr;
   6409 				alu.src[1].chan = 3;
   6410 			}
   6411 			else {
   6412 				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
   6413 			}
   6414 			alu.src[2].sel = ctx->temp_reg;
   6415 			alu.src[2].chan = i;
   6416 			alu.dst.sel = ctx->temp_reg;
   6417 			alu.dst.chan = i;
   6418 			alu.last = i == 1;
   6419 
   6420 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   6421 			if (r)
   6422 				return r;
   6423 		}
   6424 	}
   6425 
   6426 	tmp = r600_get_temp(ctx);
   6427 	for (i = 0; i < 8; i++) {
   6428 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6429 		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
   6430 
   6431 		alu.dst.sel = tmp;
   6432 		if ((i > 1 && i < 6)) {
   6433 			alu.dst.write = 1;
   6434 		}
   6435 		else {
   6436 			alu.dst.write = 0;
   6437 		}
   6438 		alu.dst.chan = i % 4;
   6439 
   6440 		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
   6441 			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
   6442 			alu.src[0].sel = ctx->temp_reg;
   6443 			alu.src[0].chan = 1 - (i % 2);
   6444 		} else {
   6445 			alu.src[0].sel = interp_gpr;
   6446 			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
   6447 		}
   6448 		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
   6449 		alu.src[1].chan = 0;
   6450 
   6451 		alu.last = i % 4 == 3;
   6452 		alu.bank_swizzle_force = SQ_ALU_VEC_210;
   6453 
   6454 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6455 		if (r)
   6456 			return r;
   6457 	}
   6458 
   6459 	// INTERP can't swizzle dst
   6460 	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   6461 	for (i = 0; i <= lasti; i++) {
   6462 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   6463 			continue;
   6464 
   6465 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6466 		alu.op = ALU_OP1_MOV;
   6467 		alu.src[0].sel = tmp;
   6468 		alu.src[0].chan = ctx->src[0].swizzle[i];
   6469 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6470 		alu.dst.write = 1;
   6471 		alu.last = i == lasti;
   6472 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6473 		if (r)
   6474 			return r;
   6475 	}
   6476 
   6477 	return 0;
   6478 }
   6479 
   6480 
   6481 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
   6482 {
   6483 	struct r600_bytecode_alu alu;
   6484 	int i, r;
   6485 
   6486 	for (i = 0; i < 4; i++) {
   6487 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6488 		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
   6489 			alu.op = ALU_OP0_NOP;
   6490 			alu.dst.chan = i;
   6491 		} else {
   6492 			alu.op = ALU_OP1_MOV;
   6493 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6494 			alu.src[0].sel = ctx->temp_reg;
   6495 			alu.src[0].chan = i;
   6496 		}
   6497 		if (i == 3) {
   6498 			alu.last = 1;
   6499 		}
   6500 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6501 		if (r)
   6502 			return r;
   6503 	}
   6504 	return 0;
   6505 }
   6506 
   6507 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
   6508                                  unsigned temp, int chan,
   6509                                  struct r600_bytecode_alu_src *bc_src,
   6510                                  const struct r600_shader_src *shader_src)
   6511 {
   6512 	struct r600_bytecode_alu alu;
   6513 	int r;
   6514 
   6515 	r600_bytecode_src(bc_src, shader_src, chan);
   6516 
   6517 	/* op3 operands don't support abs modifier */
   6518 	if (bc_src->abs) {
   6519 		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
   6520 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6521 		alu.op = ALU_OP1_MOV;
   6522 		alu.dst.sel = temp;
   6523 		alu.dst.chan = chan;
   6524 		alu.dst.write = 1;
   6525 
   6526 		alu.src[0] = *bc_src;
   6527 		alu.last = true; // sufficient?
   6528 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6529 		if (r)
   6530 			return r;
   6531 
   6532 		memset(bc_src, 0, sizeof(*bc_src));
   6533 		bc_src->sel = temp;
   6534 		bc_src->chan = chan;
   6535 	}
   6536 	return 0;
   6537 }
   6538 
   6539 static int tgsi_op3(struct r600_shader_ctx *ctx)
   6540 {
   6541 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6542 	struct r600_bytecode_alu alu;
   6543 	int i, j, r;
   6544 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   6545 	int temp_regs[4];
   6546 
   6547 	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   6548 		temp_regs[j] = 0;
   6549 		if (ctx->src[j].abs)
   6550 			temp_regs[j] = r600_get_temp(ctx);
   6551 	}
   6552 	for (i = 0; i < lasti + 1; i++) {
   6553 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   6554 			continue;
   6555 
   6556 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6557 		alu.op = ctx->inst_info->op;
   6558 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   6559 			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
   6560 			if (r)
   6561 				return r;
   6562 		}
   6563 
   6564 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6565 		alu.dst.chan = i;
   6566 		alu.dst.write = 1;
   6567 		alu.is_op3 = 1;
   6568 		if (i == lasti) {
   6569 			alu.last = 1;
   6570 		}
   6571 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6572 		if (r)
   6573 			return r;
   6574 	}
   6575 	return 0;
   6576 }
   6577 
   6578 static int tgsi_dp(struct r600_shader_ctx *ctx)
   6579 {
   6580 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6581 	struct r600_bytecode_alu alu;
   6582 	int i, j, r;
   6583 
   6584 	for (i = 0; i < 4; i++) {
   6585 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6586 		alu.op = ctx->inst_info->op;
   6587 		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   6588 			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
   6589 		}
   6590 
   6591 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   6592 		alu.dst.chan = i;
   6593 		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
   6594 		/* handle some special cases */
   6595 		switch (inst->Instruction.Opcode) {
   6596 		case TGSI_OPCODE_DP2:
   6597 			if (i > 1) {
   6598 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
   6599 				alu.src[0].chan = alu.src[1].chan = 0;
   6600 			}
   6601 			break;
   6602 		case TGSI_OPCODE_DP3:
   6603 			if (i > 2) {
   6604 				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
   6605 				alu.src[0].chan = alu.src[1].chan = 0;
   6606 			}
   6607 			break;
   6608 		case TGSI_OPCODE_DPH:
   6609 			if (i == 3) {
   6610 				alu.src[0].sel = V_SQ_ALU_SRC_1;
   6611 				alu.src[0].chan = 0;
   6612 				alu.src[0].neg = 0;
   6613 			}
   6614 			break;
   6615 		default:
   6616 			break;
   6617 		}
   6618 		if (i == 3) {
   6619 			alu.last = 1;
   6620 		}
   6621 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6622 		if (r)
   6623 			return r;
   6624 	}
   6625 	return 0;
   6626 }
   6627 
   6628 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
   6629 						    unsigned index)
   6630 {
   6631 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6632 	return 	(inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
   6633 		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
   6634 		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
   6635 		ctx->src[index].neg || ctx->src[index].abs ||
   6636 		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
   6637 }
   6638 
   6639 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
   6640 					unsigned index)
   6641 {
   6642 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6643 	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
   6644 }
   6645 
   6646 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
   6647 {
   6648 	struct r600_bytecode_vtx vtx;
   6649 	struct r600_bytecode_alu alu;
   6650 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6651 	int src_gpr, r, i;
   6652 	int id = tgsi_tex_get_src_gpr(ctx, 1);
   6653 
   6654 	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
   6655 	if (src_requires_loading) {
   6656 		for (i = 0; i < 4; i++) {
   6657 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6658 			alu.op = ALU_OP1_MOV;
   6659 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   6660 			alu.dst.sel = ctx->temp_reg;
   6661 			alu.dst.chan = i;
   6662 			if (i == 3)
   6663 				alu.last = 1;
   6664 			alu.dst.write = 1;
   6665 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   6666 			if (r)
   6667 				return r;
   6668 		}
   6669 		src_gpr = ctx->temp_reg;
   6670 	}
   6671 
   6672 	memset(&vtx, 0, sizeof(vtx));
   6673 	vtx.op = FETCH_OP_VFETCH;
   6674 	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
   6675 	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
   6676 	vtx.src_gpr = src_gpr;
   6677 	vtx.mega_fetch_count = 16;
   6678 	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
   6679 	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
   6680 	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
   6681 	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
   6682 	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
   6683 	vtx.use_const_fields = 1;
   6684 
   6685 	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
   6686 		return r;
   6687 
   6688 	if (ctx->bc->chip_class >= EVERGREEN)
   6689 		return 0;
   6690 
   6691 	for (i = 0; i < 4; i++) {
   6692 		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   6693 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   6694 			continue;
   6695 
   6696 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6697 		alu.op = ALU_OP2_AND_INT;
   6698 
   6699 		alu.dst.chan = i;
   6700 		alu.dst.sel = vtx.dst_gpr;
   6701 		alu.dst.write = 1;
   6702 
   6703 		alu.src[0].sel = vtx.dst_gpr;
   6704 		alu.src[0].chan = i;
   6705 
   6706 		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
   6707 		alu.src[1].sel += (id * 2);
   6708 		alu.src[1].chan = i % 4;
   6709 		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
   6710 
   6711 		if (i == lasti)
   6712 			alu.last = 1;
   6713 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6714 		if (r)
   6715 			return r;
   6716 	}
   6717 
   6718 	if (inst->Dst[0].Register.WriteMask & 3) {
   6719 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6720 		alu.op = ALU_OP2_OR_INT;
   6721 
   6722 		alu.dst.chan = 3;
   6723 		alu.dst.sel = vtx.dst_gpr;
   6724 		alu.dst.write = 1;
   6725 
   6726 		alu.src[0].sel = vtx.dst_gpr;
   6727 		alu.src[0].chan = 3;
   6728 
   6729 		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
   6730 		alu.src[1].chan = 0;
   6731 		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
   6732 
   6733 		alu.last = 1;
   6734 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6735 		if (r)
   6736 			return r;
   6737 	}
   6738 	return 0;
   6739 }
   6740 
   6741 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
   6742 {
   6743 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6744 	struct r600_bytecode_alu alu;
   6745 	int r;
   6746 	int id = tgsi_tex_get_src_gpr(ctx, 1);
   6747 
   6748 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6749 	alu.op = ALU_OP1_MOV;
   6750 	alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
   6751 	if (ctx->bc->chip_class >= EVERGREEN) {
   6752 		/* channel 0 or 2 of each word */
   6753 		alu.src[0].sel += (id / 2);
   6754 		alu.src[0].chan = (id % 2) * 2;
   6755 	} else {
   6756 		/* r600 we have them at channel 2 of the second dword */
   6757 		alu.src[0].sel += (id * 2) + 1;
   6758 		alu.src[0].chan = 1;
   6759 	}
   6760 	alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
   6761 	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
   6762 	alu.last = 1;
   6763 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   6764 	if (r)
   6765 		return r;
   6766 	return 0;
   6767 }
   6768 
   6769 static int tgsi_tex(struct r600_shader_ctx *ctx)
   6770 {
   6771 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   6772 	struct r600_bytecode_tex tex;
   6773 	struct r600_bytecode_alu alu;
   6774 	unsigned src_gpr;
   6775 	int r, i, j;
   6776 	int opcode;
   6777 	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
   6778 				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
   6779 				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
   6780 				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
   6781 
   6782 	bool txf_add_offsets = inst->Texture.NumOffsets &&
   6783 			     inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
   6784 			     inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
   6785 
   6786 	/* Texture fetch instructions can only use gprs as source.
   6787 	 * Also they cannot negate the source or take the absolute value */
   6788 	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
   6789 					      inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
   6790                                               tgsi_tex_src_requires_loading(ctx, 0)) ||
   6791 					     read_compressed_msaa || txf_add_offsets;
   6792 
   6793 	boolean src_loaded = FALSE;
   6794 	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
   6795 	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
   6796 	boolean has_txq_cube_array_z = false;
   6797 	unsigned sampler_index_mode;
   6798 
   6799 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
   6800 	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
   6801 	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
   6802 		if (inst->Dst[0].Register.WriteMask & 4) {
   6803 			ctx->shader->has_txq_cube_array_z_comp = true;
   6804 			has_txq_cube_array_z = true;
   6805 		}
   6806 
   6807 	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
   6808 	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
   6809 	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
   6810 	    inst->Instruction.Opcode == TGSI_OPCODE_TG4)
   6811 		sampler_src_reg = 2;
   6812 
   6813 	/* TGSI moves the sampler to src reg 3 for TXD */
   6814 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
   6815 		sampler_src_reg = 3;
   6816 
   6817 	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
   6818 
   6819 	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
   6820 
   6821 	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
   6822 		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
   6823 			ctx->shader->uses_tex_buffers = true;
   6824 			return r600_do_buffer_txq(ctx);
   6825 		}
   6826 		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
   6827 			if (ctx->bc->chip_class < EVERGREEN)
   6828 				ctx->shader->uses_tex_buffers = true;
   6829 			return do_vtx_fetch_inst(ctx, src_requires_loading);
   6830 		}
   6831 	}
   6832 
   6833 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
   6834 		int out_chan;
   6835 		/* Add perspective divide */
   6836 		if (ctx->bc->chip_class == CAYMAN) {
   6837 			out_chan = 2;
   6838 			for (i = 0; i < 3; i++) {
   6839 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6840 				alu.op = ALU_OP1_RECIP_IEEE;
   6841 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   6842 
   6843 				alu.dst.sel = ctx->temp_reg;
   6844 				alu.dst.chan = i;
   6845 				if (i == 2)
   6846 					alu.last = 1;
   6847 				if (out_chan == i)
   6848 					alu.dst.write = 1;
   6849 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   6850 				if (r)
   6851 					return r;
   6852 			}
   6853 
   6854 		} else {
   6855 			out_chan = 3;
   6856 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6857 			alu.op = ALU_OP1_RECIP_IEEE;
   6858 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   6859 
   6860 			alu.dst.sel = ctx->temp_reg;
   6861 			alu.dst.chan = out_chan;
   6862 			alu.last = 1;
   6863 			alu.dst.write = 1;
   6864 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   6865 			if (r)
   6866 				return r;
   6867 		}
   6868 
   6869 		for (i = 0; i < 3; i++) {
   6870 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6871 			alu.op = ALU_OP2_MUL;
   6872 			alu.src[0].sel = ctx->temp_reg;
   6873 			alu.src[0].chan = out_chan;
   6874 			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   6875 			alu.dst.sel = ctx->temp_reg;
   6876 			alu.dst.chan = i;
   6877 			alu.dst.write = 1;
   6878 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   6879 			if (r)
   6880 				return r;
   6881 		}
   6882 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6883 		alu.op = ALU_OP1_MOV;
   6884 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   6885 		alu.src[0].chan = 0;
   6886 		alu.dst.sel = ctx->temp_reg;
   6887 		alu.dst.chan = 3;
   6888 		alu.last = 1;
   6889 		alu.dst.write = 1;
   6890 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6891 		if (r)
   6892 			return r;
   6893 		src_loaded = TRUE;
   6894 		src_gpr = ctx->temp_reg;
   6895 	}
   6896 
   6897 
   6898 	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
   6899 	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
   6900 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
   6901 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
   6902 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
   6903 	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
   6904 
   6905 		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
   6906 		static const unsigned src1_swizzle[] = {1, 0, 2, 2};
   6907 
   6908 		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
   6909 		for (i = 0; i < 4; i++) {
   6910 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6911 			alu.op = ALU_OP2_CUBE;
   6912 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
   6913 			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
   6914 			alu.dst.sel = ctx->temp_reg;
   6915 			alu.dst.chan = i;
   6916 			if (i == 3)
   6917 				alu.last = 1;
   6918 			alu.dst.write = 1;
   6919 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   6920 			if (r)
   6921 				return r;
   6922 		}
   6923 
   6924 		/* tmp1.z = RCP_e(|tmp1.z|) */
   6925 		if (ctx->bc->chip_class == CAYMAN) {
   6926 			for (i = 0; i < 3; i++) {
   6927 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6928 				alu.op = ALU_OP1_RECIP_IEEE;
   6929 				alu.src[0].sel = ctx->temp_reg;
   6930 				alu.src[0].chan = 2;
   6931 				alu.src[0].abs = 1;
   6932 				alu.dst.sel = ctx->temp_reg;
   6933 				alu.dst.chan = i;
   6934 				if (i == 2)
   6935 					alu.dst.write = 1;
   6936 				if (i == 2)
   6937 					alu.last = 1;
   6938 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   6939 				if (r)
   6940 					return r;
   6941 			}
   6942 		} else {
   6943 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6944 			alu.op = ALU_OP1_RECIP_IEEE;
   6945 			alu.src[0].sel = ctx->temp_reg;
   6946 			alu.src[0].chan = 2;
   6947 			alu.src[0].abs = 1;
   6948 			alu.dst.sel = ctx->temp_reg;
   6949 			alu.dst.chan = 2;
   6950 			alu.dst.write = 1;
   6951 			alu.last = 1;
   6952 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   6953 			if (r)
   6954 				return r;
   6955 		}
   6956 
   6957 		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
   6958 		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
   6959 		 * muladd has no writemask, have to use another temp
   6960 		 */
   6961 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6962 		alu.op = ALU_OP3_MULADD;
   6963 		alu.is_op3 = 1;
   6964 
   6965 		alu.src[0].sel = ctx->temp_reg;
   6966 		alu.src[0].chan = 0;
   6967 		alu.src[1].sel = ctx->temp_reg;
   6968 		alu.src[1].chan = 2;
   6969 
   6970 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
   6971 		alu.src[2].chan = 0;
   6972 		alu.src[2].value = u_bitcast_f2u(1.5f);
   6973 
   6974 		alu.dst.sel = ctx->temp_reg;
   6975 		alu.dst.chan = 0;
   6976 		alu.dst.write = 1;
   6977 
   6978 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   6979 		if (r)
   6980 			return r;
   6981 
   6982 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   6983 		alu.op = ALU_OP3_MULADD;
   6984 		alu.is_op3 = 1;
   6985 
   6986 		alu.src[0].sel = ctx->temp_reg;
   6987 		alu.src[0].chan = 1;
   6988 		alu.src[1].sel = ctx->temp_reg;
   6989 		alu.src[1].chan = 2;
   6990 
   6991 		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
   6992 		alu.src[2].chan = 0;
   6993 		alu.src[2].value = u_bitcast_f2u(1.5f);
   6994 
   6995 		alu.dst.sel = ctx->temp_reg;
   6996 		alu.dst.chan = 1;
   6997 		alu.dst.write = 1;
   6998 
   6999 		alu.last = 1;
   7000 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7001 		if (r)
   7002 			return r;
   7003 		/* write initial compare value into Z component
   7004 		  - W src 0 for shadow cube
   7005 		  - X src 1 for shadow cube array */
   7006 		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
   7007 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
   7008 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7009 			alu.op = ALU_OP1_MOV;
   7010 			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
   7011 				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
   7012 			else
   7013 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   7014 			alu.dst.sel = ctx->temp_reg;
   7015 			alu.dst.chan = 2;
   7016 			alu.dst.write = 1;
   7017 			alu.last = 1;
   7018 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   7019 			if (r)
   7020 				return r;
   7021 		}
   7022 
   7023 		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
   7024 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
   7025 			if (ctx->bc->chip_class >= EVERGREEN) {
   7026 				int mytmp = r600_get_temp(ctx);
   7027 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7028 				alu.op = ALU_OP1_MOV;
   7029 				alu.src[0].sel = ctx->temp_reg;
   7030 				alu.src[0].chan = 3;
   7031 				alu.dst.sel = mytmp;
   7032 				alu.dst.chan = 0;
   7033 				alu.dst.write = 1;
   7034 				alu.last = 1;
   7035 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7036 				if (r)
   7037 					return r;
   7038 
   7039 				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
   7040 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7041 				alu.op = ALU_OP3_MULADD;
   7042 				alu.is_op3 = 1;
   7043 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   7044 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   7045 				alu.src[1].chan = 0;
   7046 				alu.src[1].value = u_bitcast_f2u(8.0f);
   7047 				alu.src[2].sel = mytmp;
   7048 				alu.src[2].chan = 0;
   7049 				alu.dst.sel = ctx->temp_reg;
   7050 				alu.dst.chan = 3;
   7051 				alu.dst.write = 1;
   7052 				alu.last = 1;
   7053 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7054 				if (r)
   7055 					return r;
   7056 			} else if (ctx->bc->chip_class < EVERGREEN) {
   7057 				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   7058 				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
   7059 				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   7060 				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
   7061 				tex.src_gpr = r600_get_temp(ctx);
   7062 				tex.src_sel_x = 0;
   7063 				tex.src_sel_y = 0;
   7064 				tex.src_sel_z = 0;
   7065 				tex.src_sel_w = 0;
   7066 				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
   7067 				tex.coord_type_x = 1;
   7068 				tex.coord_type_y = 1;
   7069 				tex.coord_type_z = 1;
   7070 				tex.coord_type_w = 1;
   7071 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7072 				alu.op = ALU_OP1_MOV;
   7073 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   7074 				alu.dst.sel = tex.src_gpr;
   7075 				alu.dst.chan = 0;
   7076 				alu.last = 1;
   7077 				alu.dst.write = 1;
   7078 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7079 				if (r)
   7080 					return r;
   7081 
   7082 				r = r600_bytecode_add_tex(ctx->bc, &tex);
   7083 				if (r)
   7084 					return r;
   7085 			}
   7086 
   7087 		}
   7088 
   7089 		/* for cube forms of lod and bias we need to route things */
   7090 		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
   7091 		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
   7092 		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
   7093 		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
   7094 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7095 			alu.op = ALU_OP1_MOV;
   7096 			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
   7097 			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
   7098 				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
   7099 			else
   7100 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
   7101 			alu.dst.sel = ctx->temp_reg;
   7102 			alu.dst.chan = 2;
   7103 			alu.last = 1;
   7104 			alu.dst.write = 1;
   7105 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   7106 			if (r)
   7107 				return r;
   7108 		}
   7109 
   7110 		src_loaded = TRUE;
   7111 		src_gpr = ctx->temp_reg;
   7112 	}
   7113 
   7114 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
   7115 		int temp_h = 0, temp_v = 0;
   7116 		int start_val = 0;
   7117 
   7118 		/* if we've already loaded the src (i.e. CUBE don't reload it). */
   7119 		if (src_loaded == TRUE)
   7120 			start_val = 1;
   7121 		else
   7122 			src_loaded = TRUE;
   7123 		for (i = start_val; i < 3; i++) {
   7124 			int treg = r600_get_temp(ctx);
   7125 
   7126 			if (i == 0)
   7127 				src_gpr = treg;
   7128 			else if (i == 1)
   7129 				temp_h = treg;
   7130 			else
   7131 				temp_v = treg;
   7132 
   7133 			for (j = 0; j < 4; j++) {
   7134 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7135 				alu.op = ALU_OP1_MOV;
   7136                                 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
   7137                                 alu.dst.sel = treg;
   7138                                 alu.dst.chan = j;
   7139                                 if (j == 3)
   7140                                    alu.last = 1;
   7141                                 alu.dst.write = 1;
   7142                                 r = r600_bytecode_add_alu(ctx->bc, &alu);
   7143                                 if (r)
   7144                                     return r;
   7145 			}
   7146 		}
   7147 		for (i = 1; i < 3; i++) {
   7148 			/* set gradients h/v */
   7149 			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   7150 			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
   7151 				FETCH_OP_SET_GRADIENTS_V;
   7152 			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   7153 			tex.sampler_index_mode = sampler_index_mode;
   7154 			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
   7155 			tex.resource_index_mode = sampler_index_mode;
   7156 
   7157 			tex.src_gpr = (i == 1) ? temp_h : temp_v;
   7158 			tex.src_sel_x = 0;
   7159 			tex.src_sel_y = 1;
   7160 			tex.src_sel_z = 2;
   7161 			tex.src_sel_w = 3;
   7162 
   7163 			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
   7164 			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
   7165 			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
   7166 				tex.coord_type_x = 1;
   7167 				tex.coord_type_y = 1;
   7168 				tex.coord_type_z = 1;
   7169 				tex.coord_type_w = 1;
   7170 			}
   7171 			r = r600_bytecode_add_tex(ctx->bc, &tex);
   7172 			if (r)
   7173 				return r;
   7174 		}
   7175 	}
   7176 
   7177 	if (src_requires_loading && !src_loaded) {
   7178 		for (i = 0; i < 4; i++) {
   7179 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7180 			alu.op = ALU_OP1_MOV;
   7181 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   7182 			alu.dst.sel = ctx->temp_reg;
   7183 			alu.dst.chan = i;
   7184 			if (i == 3)
   7185 				alu.last = 1;
   7186 			alu.dst.write = 1;
   7187 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   7188 			if (r)
   7189 				return r;
   7190 		}
   7191 		src_loaded = TRUE;
   7192 		src_gpr = ctx->temp_reg;
   7193 	}
   7194 
   7195 	/* get offset values */
   7196 	if (inst->Texture.NumOffsets) {
   7197 		assert(inst->Texture.NumOffsets == 1);
   7198 
   7199 		/* The texture offset feature doesn't work with the TXF instruction
   7200 		 * and must be emulated by adding the offset to the texture coordinates. */
   7201 		if (txf_add_offsets) {
   7202 			const struct tgsi_texture_offset *off = inst->TexOffsets;
   7203 
   7204 			switch (inst->Texture.Texture) {
   7205 			case TGSI_TEXTURE_3D:
   7206 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7207 				alu.op = ALU_OP2_ADD_INT;
   7208 				alu.src[0].sel = src_gpr;
   7209 				alu.src[0].chan = 2;
   7210 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   7211 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
   7212 				alu.dst.sel = src_gpr;
   7213 				alu.dst.chan = 2;
   7214 				alu.dst.write = 1;
   7215 				alu.last = 1;
   7216 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7217 				if (r)
   7218 					return r;
   7219 				/* fall through */
   7220 
   7221 			case TGSI_TEXTURE_2D:
   7222 			case TGSI_TEXTURE_SHADOW2D:
   7223 			case TGSI_TEXTURE_RECT:
   7224 			case TGSI_TEXTURE_SHADOWRECT:
   7225 			case TGSI_TEXTURE_2D_ARRAY:
   7226 			case TGSI_TEXTURE_SHADOW2D_ARRAY:
   7227 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7228 				alu.op = ALU_OP2_ADD_INT;
   7229 				alu.src[0].sel = src_gpr;
   7230 				alu.src[0].chan = 1;
   7231 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   7232 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
   7233 				alu.dst.sel = src_gpr;
   7234 				alu.dst.chan = 1;
   7235 				alu.dst.write = 1;
   7236 				alu.last = 1;
   7237 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7238 				if (r)
   7239 					return r;
   7240 				/* fall through */
   7241 
   7242 			case TGSI_TEXTURE_1D:
   7243 			case TGSI_TEXTURE_SHADOW1D:
   7244 			case TGSI_TEXTURE_1D_ARRAY:
   7245 			case TGSI_TEXTURE_SHADOW1D_ARRAY:
   7246 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7247 				alu.op = ALU_OP2_ADD_INT;
   7248 				alu.src[0].sel = src_gpr;
   7249 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   7250 				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
   7251 				alu.dst.sel = src_gpr;
   7252 				alu.dst.write = 1;
   7253 				alu.last = 1;
   7254 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7255 				if (r)
   7256 					return r;
   7257 				break;
   7258 				/* texture offsets do not apply to other texture targets */
   7259 			}
   7260 		} else {
   7261 			switch (inst->Texture.Texture) {
   7262 			case TGSI_TEXTURE_3D:
   7263 				offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
   7264 				/* fallthrough */
   7265 			case TGSI_TEXTURE_2D:
   7266 			case TGSI_TEXTURE_SHADOW2D:
   7267 			case TGSI_TEXTURE_RECT:
   7268 			case TGSI_TEXTURE_SHADOWRECT:
   7269 			case TGSI_TEXTURE_2D_ARRAY:
   7270 			case TGSI_TEXTURE_SHADOW2D_ARRAY:
   7271 				offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
   7272 				/* fallthrough */
   7273 			case TGSI_TEXTURE_1D:
   7274 			case TGSI_TEXTURE_SHADOW1D:
   7275 			case TGSI_TEXTURE_1D_ARRAY:
   7276 			case TGSI_TEXTURE_SHADOW1D_ARRAY:
   7277 				offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
   7278 			}
   7279 		}
   7280 	}
   7281 
   7282 	/* Obtain the sample index for reading a compressed MSAA color texture.
   7283 	 * To read the FMASK, we use the ldfptr instruction, which tells us
   7284 	 * where the samples are stored.
   7285 	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
   7286 	 * which is the identity mapping. Each nibble says which physical sample
   7287 	 * should be fetched to get that sample.
   7288 	 *
   7289 	 * Assume src.z contains the sample index. It should be modified like this:
   7290 	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
   7291 	 * Then fetch the texel with src.
   7292 	 */
   7293 	if (read_compressed_msaa) {
   7294 		unsigned sample_chan = 3;
   7295 		unsigned temp = r600_get_temp(ctx);
   7296 		assert(src_loaded);
   7297 
   7298 		/* temp.w = ldfptr() */
   7299 		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   7300 		tex.op = FETCH_OP_LD;
   7301 		tex.inst_mod = 1; /* to indicate this is ldfptr */
   7302 		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   7303 		tex.sampler_index_mode = sampler_index_mode;
   7304 		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
   7305 		tex.resource_index_mode = sampler_index_mode;
   7306 		tex.src_gpr = src_gpr;
   7307 		tex.dst_gpr = temp;
   7308 		tex.dst_sel_x = 7; /* mask out these components */
   7309 		tex.dst_sel_y = 7;
   7310 		tex.dst_sel_z = 7;
   7311 		tex.dst_sel_w = 0; /* store X */
   7312 		tex.src_sel_x = 0;
   7313 		tex.src_sel_y = 1;
   7314 		tex.src_sel_z = 2;
   7315 		tex.src_sel_w = 3;
   7316 		tex.offset_x = offset_x;
   7317 		tex.offset_y = offset_y;
   7318 		tex.offset_z = offset_z;
   7319 		r = r600_bytecode_add_tex(ctx->bc, &tex);
   7320 		if (r)
   7321 			return r;
   7322 
   7323 		/* temp.x = sample_index*4 */
   7324 		if (ctx->bc->chip_class == CAYMAN) {
   7325 			for (i = 0 ; i < 4; i++) {
   7326 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7327 				alu.op = ALU_OP2_MULLO_INT;
   7328 				alu.src[0].sel = src_gpr;
   7329 				alu.src[0].chan = sample_chan;
   7330 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   7331 				alu.src[1].value = 4;
   7332 				alu.dst.sel = temp;
   7333 				alu.dst.chan = i;
   7334 				alu.dst.write = i == 0;
   7335 				if (i == 3)
   7336 					alu.last = 1;
   7337 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7338 				if (r)
   7339 					return r;
   7340 			}
   7341 		} else {
   7342 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7343 			alu.op = ALU_OP2_MULLO_INT;
   7344 			alu.src[0].sel = src_gpr;
   7345 			alu.src[0].chan = sample_chan;
   7346 			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   7347 			alu.src[1].value = 4;
   7348 			alu.dst.sel = temp;
   7349 			alu.dst.chan = 0;
   7350 			alu.dst.write = 1;
   7351 			alu.last = 1;
   7352 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   7353 			if (r)
   7354 				return r;
   7355 		}
   7356 
   7357 		/* sample_index = temp.w >> temp.x */
   7358 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7359 		alu.op = ALU_OP2_LSHR_INT;
   7360 		alu.src[0].sel = temp;
   7361 		alu.src[0].chan = 3;
   7362 		alu.src[1].sel = temp;
   7363 		alu.src[1].chan = 0;
   7364 		alu.dst.sel = src_gpr;
   7365 		alu.dst.chan = sample_chan;
   7366 		alu.dst.write = 1;
   7367 		alu.last = 1;
   7368 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7369 		if (r)
   7370 			return r;
   7371 
   7372 		/* sample_index & 0xF */
   7373 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7374 		alu.op = ALU_OP2_AND_INT;
   7375 		alu.src[0].sel = src_gpr;
   7376 		alu.src[0].chan = sample_chan;
   7377 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   7378 		alu.src[1].value = 0xF;
   7379 		alu.dst.sel = src_gpr;
   7380 		alu.dst.chan = sample_chan;
   7381 		alu.dst.write = 1;
   7382 		alu.last = 1;
   7383 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7384 		if (r)
   7385 			return r;
   7386 #if 0
   7387 		/* visualize the FMASK */
   7388 		for (i = 0; i < 4; i++) {
   7389 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7390 			alu.op = ALU_OP1_INT_TO_FLT;
   7391 			alu.src[0].sel = src_gpr;
   7392 			alu.src[0].chan = sample_chan;
   7393 			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
   7394 			alu.dst.chan = i;
   7395 			alu.dst.write = 1;
   7396 			alu.last = 1;
   7397 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   7398 			if (r)
   7399 				return r;
   7400 		}
   7401 		return 0;
   7402 #endif
   7403 	}
   7404 
   7405 	/* does this shader want a num layers from TXQ for a cube array? */
   7406 	if (has_txq_cube_array_z) {
   7407 		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   7408 
   7409 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7410 		alu.op = ALU_OP1_MOV;
   7411 
   7412 		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
   7413 		if (ctx->bc->chip_class >= EVERGREEN) {
   7414 			/* channel 1 or 3 of each word */
   7415 			alu.src[0].sel += (id / 2);
   7416 			alu.src[0].chan = ((id % 2) * 2) + 1;
   7417 		} else {
   7418 			/* r600 we have them at channel 2 of the second dword */
   7419 			alu.src[0].sel += (id * 2) + 1;
   7420 			alu.src[0].chan = 2;
   7421 		}
   7422 		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
   7423 		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
   7424 		alu.last = 1;
   7425 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7426 		if (r)
   7427 			return r;
   7428 		/* disable writemask from texture instruction */
   7429 		inst->Dst[0].Register.WriteMask &= ~4;
   7430 	}
   7431 
   7432 	opcode = ctx->inst_info->op;
   7433 	if (opcode == FETCH_OP_GATHER4 &&
   7434 		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
   7435 		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
   7436 		opcode = FETCH_OP_GATHER4_O;
   7437 
   7438 		/* GATHER4_O/GATHER4_C_O use offset values loaded by
   7439 		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
   7440 		   encoded in the instruction are ignored. */
   7441 		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   7442 		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
   7443 		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   7444 		tex.sampler_index_mode = sampler_index_mode;
   7445 		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
   7446 		tex.resource_index_mode = sampler_index_mode;
   7447 
   7448 		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
   7449 		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
   7450 		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
   7451 		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
   7452 		tex.src_sel_w = 4;
   7453 
   7454 		tex.dst_sel_x = 7;
   7455 		tex.dst_sel_y = 7;
   7456 		tex.dst_sel_z = 7;
   7457 		tex.dst_sel_w = 7;
   7458 
   7459 		r = r600_bytecode_add_tex(ctx->bc, &tex);
   7460 		if (r)
   7461 			return r;
   7462 	}
   7463 
   7464 	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
   7465 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
   7466 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
   7467 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
   7468 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
   7469 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
   7470 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
   7471 		switch (opcode) {
   7472 		case FETCH_OP_SAMPLE:
   7473 			opcode = FETCH_OP_SAMPLE_C;
   7474 			break;
   7475 		case FETCH_OP_SAMPLE_L:
   7476 			opcode = FETCH_OP_SAMPLE_C_L;
   7477 			break;
   7478 		case FETCH_OP_SAMPLE_LB:
   7479 			opcode = FETCH_OP_SAMPLE_C_LB;
   7480 			break;
   7481 		case FETCH_OP_SAMPLE_G:
   7482 			opcode = FETCH_OP_SAMPLE_C_G;
   7483 			break;
   7484 		/* Texture gather variants */
   7485 		case FETCH_OP_GATHER4:
   7486 			opcode = FETCH_OP_GATHER4_C;
   7487 			break;
   7488 		case FETCH_OP_GATHER4_O:
   7489 			opcode = FETCH_OP_GATHER4_C_O;
   7490 			break;
   7491 		}
   7492 	}
   7493 
   7494 	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   7495 	tex.op = opcode;
   7496 
   7497 	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
   7498 	tex.sampler_index_mode = sampler_index_mode;
   7499 	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
   7500 	tex.resource_index_mode = sampler_index_mode;
   7501 	tex.src_gpr = src_gpr;
   7502 	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
   7503 
   7504 	if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
   7505 		inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
   7506 		tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
   7507 	}
   7508 
   7509 	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
   7510 		int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
   7511 		tex.inst_mod = texture_component_select;
   7512 
   7513 		if (ctx->bc->chip_class == CAYMAN) {
   7514 		/* GATHER4 result order is different from TGSI TG4 */
   7515 			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
   7516 			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
   7517 			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
   7518 			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
   7519 		} else {
   7520 			tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
   7521 			tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
   7522 			tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
   7523 			tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
   7524 		}
   7525 	}
   7526 	else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
   7527 		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
   7528 		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
   7529 		tex.dst_sel_z = 7;
   7530 		tex.dst_sel_w = 7;
   7531 	}
   7532 	else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
   7533 		tex.dst_sel_x = 3;
   7534 		tex.dst_sel_y = 7;
   7535 		tex.dst_sel_z = 7;
   7536 		tex.dst_sel_w = 7;
   7537 	}
   7538 	else {
   7539 		tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
   7540 		tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
   7541 		tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
   7542 		tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
   7543 	}
   7544 
   7545 
   7546 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
   7547 	    inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
   7548 		tex.src_sel_x = 4;
   7549 		tex.src_sel_y = 4;
   7550 		tex.src_sel_z = 4;
   7551 		tex.src_sel_w = 4;
   7552 	} else if (src_loaded) {
   7553 		tex.src_sel_x = 0;
   7554 		tex.src_sel_y = 1;
   7555 		tex.src_sel_z = 2;
   7556 		tex.src_sel_w = 3;
   7557 	} else {
   7558 		tex.src_sel_x = ctx->src[0].swizzle[0];
   7559 		tex.src_sel_y = ctx->src[0].swizzle[1];
   7560 		tex.src_sel_z = ctx->src[0].swizzle[2];
   7561 		tex.src_sel_w = ctx->src[0].swizzle[3];
   7562 		tex.src_rel = ctx->src[0].rel;
   7563 	}
   7564 
   7565 	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
   7566 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
   7567 	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
   7568 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
   7569 		tex.src_sel_x = 1;
   7570 		tex.src_sel_y = 0;
   7571 		tex.src_sel_z = 3;
   7572 		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
   7573 	}
   7574 
   7575 	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
   7576 	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
   7577 		tex.coord_type_x = 1;
   7578 		tex.coord_type_y = 1;
   7579 	}
   7580 	tex.coord_type_z = 1;
   7581 	tex.coord_type_w = 1;
   7582 
   7583 	tex.offset_x = offset_x;
   7584 	tex.offset_y = offset_y;
   7585 	if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
   7586 		(inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
   7587 		 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
   7588 		tex.offset_z = 0;
   7589 	}
   7590 	else {
   7591 		tex.offset_z = offset_z;
   7592 	}
   7593 
   7594 	/* Put the depth for comparison in W.
   7595 	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
   7596 	 * Some instructions expect the depth in Z. */
   7597 	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
   7598 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
   7599 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
   7600 	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
   7601 	    opcode != FETCH_OP_SAMPLE_C_L &&
   7602 	    opcode != FETCH_OP_SAMPLE_C_LB) {
   7603 		tex.src_sel_w = tex.src_sel_z;
   7604 	}
   7605 
   7606 	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
   7607 	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
   7608 		if (opcode == FETCH_OP_SAMPLE_C_L ||
   7609 		    opcode == FETCH_OP_SAMPLE_C_LB) {
   7610 			/* the array index is read from Y */
   7611 			tex.coord_type_y = 0;
   7612 		} else {
   7613 			/* the array index is read from Z */
   7614 			tex.coord_type_z = 0;
   7615 			tex.src_sel_z = tex.src_sel_y;
   7616 		}
   7617 	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
   7618 		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
   7619 		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
   7620 		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
   7621 		    (ctx->bc->chip_class >= EVERGREEN)))
   7622 		/* the array index is read from Z */
   7623 		tex.coord_type_z = 0;
   7624 
   7625 	/* mask unused source components */
   7626 	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
   7627 		switch (inst->Texture.Texture) {
   7628 		case TGSI_TEXTURE_2D:
   7629 		case TGSI_TEXTURE_RECT:
   7630 			tex.src_sel_z = 7;
   7631 			tex.src_sel_w = 7;
   7632 			break;
   7633 		case TGSI_TEXTURE_1D_ARRAY:
   7634 			tex.src_sel_y = 7;
   7635 			tex.src_sel_w = 7;
   7636 			break;
   7637 		case TGSI_TEXTURE_1D:
   7638 			tex.src_sel_y = 7;
   7639 			tex.src_sel_z = 7;
   7640 			tex.src_sel_w = 7;
   7641 			break;
   7642 		}
   7643 	}
   7644 
   7645 	r = r600_bytecode_add_tex(ctx->bc, &tex);
   7646 	if (r)
   7647 		return r;
   7648 
   7649 	/* add shadow ambient support  - gallium doesn't do it yet */
   7650 	return 0;
   7651 }
   7652 
   7653 static int tgsi_lrp(struct r600_shader_ctx *ctx)
   7654 {
   7655 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   7656 	struct r600_bytecode_alu alu;
   7657 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   7658 	unsigned i, temp_regs[2];
   7659 	int r;
   7660 
   7661 	/* optimize if it's just an equal balance */
   7662 	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
   7663 		for (i = 0; i < lasti + 1; i++) {
   7664 			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   7665 				continue;
   7666 
   7667 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7668 			alu.op = ALU_OP2_ADD;
   7669 			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
   7670 			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   7671 			alu.omod = 3;
   7672 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   7673 			alu.dst.chan = i;
   7674 			if (i == lasti) {
   7675 				alu.last = 1;
   7676 			}
   7677 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   7678 			if (r)
   7679 				return r;
   7680 		}
   7681 		return 0;
   7682 	}
   7683 
   7684 	/* 1 - src0 */
   7685 	for (i = 0; i < lasti + 1; i++) {
   7686 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   7687 			continue;
   7688 
   7689 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7690 		alu.op = ALU_OP2_ADD;
   7691 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   7692 		alu.src[0].chan = 0;
   7693 		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
   7694 		r600_bytecode_src_toggle_neg(&alu.src[1]);
   7695 		alu.dst.sel = ctx->temp_reg;
   7696 		alu.dst.chan = i;
   7697 		if (i == lasti) {
   7698 			alu.last = 1;
   7699 		}
   7700 		alu.dst.write = 1;
   7701 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7702 		if (r)
   7703 			return r;
   7704 	}
   7705 
   7706 	/* (1 - src0) * src2 */
   7707 	for (i = 0; i < lasti + 1; i++) {
   7708 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   7709 			continue;
   7710 
   7711 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7712 		alu.op = ALU_OP2_MUL;
   7713 		alu.src[0].sel = ctx->temp_reg;
   7714 		alu.src[0].chan = i;
   7715 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   7716 		alu.dst.sel = ctx->temp_reg;
   7717 		alu.dst.chan = i;
   7718 		if (i == lasti) {
   7719 			alu.last = 1;
   7720 		}
   7721 		alu.dst.write = 1;
   7722 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7723 		if (r)
   7724 			return r;
   7725 	}
   7726 
   7727 	/* src0 * src1 + (1 - src0) * src2 */
   7728         if (ctx->src[0].abs)
   7729 		temp_regs[0] = r600_get_temp(ctx);
   7730 	else
   7731 		temp_regs[0] = 0;
   7732 	if (ctx->src[1].abs)
   7733 		temp_regs[1] = r600_get_temp(ctx);
   7734 	else
   7735 		temp_regs[1] = 0;
   7736 
   7737 	for (i = 0; i < lasti + 1; i++) {
   7738 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   7739 			continue;
   7740 
   7741 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7742 		alu.op = ALU_OP3_MULADD;
   7743 		alu.is_op3 = 1;
   7744 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
   7745 		if (r)
   7746 			return r;
   7747 		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
   7748 		if (r)
   7749 			return r;
   7750 		alu.src[2].sel = ctx->temp_reg;
   7751 		alu.src[2].chan = i;
   7752 
   7753 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   7754 		alu.dst.chan = i;
   7755 		if (i == lasti) {
   7756 			alu.last = 1;
   7757 		}
   7758 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7759 		if (r)
   7760 			return r;
   7761 	}
   7762 	return 0;
   7763 }
   7764 
   7765 static int tgsi_cmp(struct r600_shader_ctx *ctx)
   7766 {
   7767 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   7768 	struct r600_bytecode_alu alu;
   7769 	int i, r, j;
   7770 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   7771 	int temp_regs[3];
   7772 	unsigned op;
   7773 
   7774 	if (ctx->src[0].abs && ctx->src[0].neg) {
   7775 		op = ALU_OP3_CNDE;
   7776 		ctx->src[0].abs = 0;
   7777 		ctx->src[0].neg = 0;
   7778 	} else {
   7779 		op = ALU_OP3_CNDGE;
   7780 	}
   7781 
   7782 	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
   7783 		temp_regs[j] = 0;
   7784 		if (ctx->src[j].abs)
   7785 			temp_regs[j] = r600_get_temp(ctx);
   7786 	}
   7787 
   7788 	for (i = 0; i < lasti + 1; i++) {
   7789 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   7790 			continue;
   7791 
   7792 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7793 		alu.op = op;
   7794 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
   7795 		if (r)
   7796 			return r;
   7797 		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
   7798 		if (r)
   7799 			return r;
   7800 		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
   7801 		if (r)
   7802 			return r;
   7803 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   7804 		alu.dst.chan = i;
   7805 		alu.dst.write = 1;
   7806 		alu.is_op3 = 1;
   7807 		if (i == lasti)
   7808 			alu.last = 1;
   7809 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7810 		if (r)
   7811 			return r;
   7812 	}
   7813 	return 0;
   7814 }
   7815 
   7816 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
   7817 {
   7818 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   7819 	struct r600_bytecode_alu alu;
   7820 	int i, r;
   7821 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   7822 
   7823 	for (i = 0; i < lasti + 1; i++) {
   7824 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   7825 			continue;
   7826 
   7827 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7828 		alu.op = ALU_OP3_CNDE_INT;
   7829 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   7830 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   7831 		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
   7832 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   7833 		alu.dst.chan = i;
   7834 		alu.dst.write = 1;
   7835 		alu.is_op3 = 1;
   7836 		if (i == lasti)
   7837 			alu.last = 1;
   7838 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7839 		if (r)
   7840 			return r;
   7841 	}
   7842 	return 0;
   7843 }
   7844 
   7845 static int tgsi_xpd(struct r600_shader_ctx *ctx)
   7846 {
   7847 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   7848 	static const unsigned int src0_swizzle[] = {2, 0, 1};
   7849 	static const unsigned int src1_swizzle[] = {1, 2, 0};
   7850 	struct r600_bytecode_alu alu;
   7851 	uint32_t use_temp = 0;
   7852 	int i, r;
   7853 
   7854 	if (inst->Dst[0].Register.WriteMask != 0xf)
   7855 		use_temp = 1;
   7856 
   7857 	for (i = 0; i < 4; i++) {
   7858 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7859 		alu.op = ALU_OP2_MUL;
   7860 		if (i < 3) {
   7861 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
   7862 			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
   7863 		} else {
   7864 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   7865 			alu.src[0].chan = i;
   7866 			alu.src[1].sel = V_SQ_ALU_SRC_0;
   7867 			alu.src[1].chan = i;
   7868 		}
   7869 
   7870 		alu.dst.sel = ctx->temp_reg;
   7871 		alu.dst.chan = i;
   7872 		alu.dst.write = 1;
   7873 
   7874 		if (i == 3)
   7875 			alu.last = 1;
   7876 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7877 		if (r)
   7878 			return r;
   7879 	}
   7880 
   7881 	for (i = 0; i < 4; i++) {
   7882 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7883 		alu.op = ALU_OP3_MULADD;
   7884 
   7885 		if (i < 3) {
   7886 			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
   7887 			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
   7888 		} else {
   7889 			alu.src[0].sel = V_SQ_ALU_SRC_0;
   7890 			alu.src[0].chan = i;
   7891 			alu.src[1].sel = V_SQ_ALU_SRC_0;
   7892 			alu.src[1].chan = i;
   7893 		}
   7894 
   7895 		alu.src[2].sel = ctx->temp_reg;
   7896 		alu.src[2].neg = 1;
   7897 		alu.src[2].chan = i;
   7898 
   7899 		if (use_temp)
   7900 			alu.dst.sel = ctx->temp_reg;
   7901 		else
   7902 			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   7903 		alu.dst.chan = i;
   7904 		alu.dst.write = 1;
   7905 		alu.is_op3 = 1;
   7906 		if (i == 3)
   7907 			alu.last = 1;
   7908 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7909 		if (r)
   7910 			return r;
   7911 	}
   7912 	if (use_temp)
   7913 		return tgsi_helper_copy(ctx, inst);
   7914 	return 0;
   7915 }
   7916 
   7917 static int tgsi_exp(struct r600_shader_ctx *ctx)
   7918 {
   7919 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   7920 	struct r600_bytecode_alu alu;
   7921 	int r;
   7922 	unsigned i;
   7923 
   7924 	/* result.x = 2^floor(src); */
   7925 	if (inst->Dst[0].Register.WriteMask & 1) {
   7926 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7927 
   7928 		alu.op = ALU_OP1_FLOOR;
   7929 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   7930 
   7931 		alu.dst.sel = ctx->temp_reg;
   7932 		alu.dst.chan = 0;
   7933 		alu.dst.write = 1;
   7934 		alu.last = 1;
   7935 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7936 		if (r)
   7937 			return r;
   7938 
   7939 		if (ctx->bc->chip_class == CAYMAN) {
   7940 			for (i = 0; i < 3; i++) {
   7941 				alu.op = ALU_OP1_EXP_IEEE;
   7942 				alu.src[0].sel = ctx->temp_reg;
   7943 				alu.src[0].chan = 0;
   7944 
   7945 				alu.dst.sel = ctx->temp_reg;
   7946 				alu.dst.chan = i;
   7947 				alu.dst.write = i == 0;
   7948 				alu.last = i == 2;
   7949 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   7950 				if (r)
   7951 					return r;
   7952 			}
   7953 		} else {
   7954 			alu.op = ALU_OP1_EXP_IEEE;
   7955 			alu.src[0].sel = ctx->temp_reg;
   7956 			alu.src[0].chan = 0;
   7957 
   7958 			alu.dst.sel = ctx->temp_reg;
   7959 			alu.dst.chan = 0;
   7960 			alu.dst.write = 1;
   7961 			alu.last = 1;
   7962 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   7963 			if (r)
   7964 				return r;
   7965 		}
   7966 	}
   7967 
   7968 	/* result.y = tmp - floor(tmp); */
   7969 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
   7970 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7971 
   7972 		alu.op = ALU_OP1_FRACT;
   7973 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   7974 
   7975 		alu.dst.sel = ctx->temp_reg;
   7976 #if 0
   7977 		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   7978 		if (r)
   7979 			return r;
   7980 #endif
   7981 		alu.dst.write = 1;
   7982 		alu.dst.chan = 1;
   7983 
   7984 		alu.last = 1;
   7985 
   7986 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   7987 		if (r)
   7988 			return r;
   7989 	}
   7990 
   7991 	/* result.z = RoughApprox2ToX(tmp);*/
   7992 	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
   7993 		if (ctx->bc->chip_class == CAYMAN) {
   7994 			for (i = 0; i < 3; i++) {
   7995 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   7996 				alu.op = ALU_OP1_EXP_IEEE;
   7997 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   7998 
   7999 				alu.dst.sel = ctx->temp_reg;
   8000 				alu.dst.chan = i;
   8001 				if (i == 2) {
   8002 					alu.dst.write = 1;
   8003 					alu.last = 1;
   8004 				}
   8005 
   8006 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   8007 				if (r)
   8008 					return r;
   8009 			}
   8010 		} else {
   8011 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8012 			alu.op = ALU_OP1_EXP_IEEE;
   8013 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8014 
   8015 			alu.dst.sel = ctx->temp_reg;
   8016 			alu.dst.write = 1;
   8017 			alu.dst.chan = 2;
   8018 
   8019 			alu.last = 1;
   8020 
   8021 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   8022 			if (r)
   8023 				return r;
   8024 		}
   8025 	}
   8026 
   8027 	/* result.w = 1.0;*/
   8028 	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
   8029 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8030 
   8031 		alu.op = ALU_OP1_MOV;
   8032 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   8033 		alu.src[0].chan = 0;
   8034 
   8035 		alu.dst.sel = ctx->temp_reg;
   8036 		alu.dst.chan = 3;
   8037 		alu.dst.write = 1;
   8038 		alu.last = 1;
   8039 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8040 		if (r)
   8041 			return r;
   8042 	}
   8043 	return tgsi_helper_copy(ctx, inst);
   8044 }
   8045 
   8046 static int tgsi_log(struct r600_shader_ctx *ctx)
   8047 {
   8048 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   8049 	struct r600_bytecode_alu alu;
   8050 	int r;
   8051 	unsigned i;
   8052 
   8053 	/* result.x = floor(log2(|src|)); */
   8054 	if (inst->Dst[0].Register.WriteMask & 1) {
   8055 		if (ctx->bc->chip_class == CAYMAN) {
   8056 			for (i = 0; i < 3; i++) {
   8057 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8058 
   8059 				alu.op = ALU_OP1_LOG_IEEE;
   8060 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8061 				r600_bytecode_src_set_abs(&alu.src[0]);
   8062 
   8063 				alu.dst.sel = ctx->temp_reg;
   8064 				alu.dst.chan = i;
   8065 				if (i == 0)
   8066 					alu.dst.write = 1;
   8067 				if (i == 2)
   8068 					alu.last = 1;
   8069 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   8070 				if (r)
   8071 					return r;
   8072 			}
   8073 
   8074 		} else {
   8075 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8076 
   8077 			alu.op = ALU_OP1_LOG_IEEE;
   8078 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8079 			r600_bytecode_src_set_abs(&alu.src[0]);
   8080 
   8081 			alu.dst.sel = ctx->temp_reg;
   8082 			alu.dst.chan = 0;
   8083 			alu.dst.write = 1;
   8084 			alu.last = 1;
   8085 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   8086 			if (r)
   8087 				return r;
   8088 		}
   8089 
   8090 		alu.op = ALU_OP1_FLOOR;
   8091 		alu.src[0].sel = ctx->temp_reg;
   8092 		alu.src[0].chan = 0;
   8093 
   8094 		alu.dst.sel = ctx->temp_reg;
   8095 		alu.dst.chan = 0;
   8096 		alu.dst.write = 1;
   8097 		alu.last = 1;
   8098 
   8099 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8100 		if (r)
   8101 			return r;
   8102 	}
   8103 
   8104 	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
   8105 	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
   8106 
   8107 		if (ctx->bc->chip_class == CAYMAN) {
   8108 			for (i = 0; i < 3; i++) {
   8109 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8110 
   8111 				alu.op = ALU_OP1_LOG_IEEE;
   8112 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8113 				r600_bytecode_src_set_abs(&alu.src[0]);
   8114 
   8115 				alu.dst.sel = ctx->temp_reg;
   8116 				alu.dst.chan = i;
   8117 				if (i == 1)
   8118 					alu.dst.write = 1;
   8119 				if (i == 2)
   8120 					alu.last = 1;
   8121 
   8122 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   8123 				if (r)
   8124 					return r;
   8125 			}
   8126 		} else {
   8127 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8128 
   8129 			alu.op = ALU_OP1_LOG_IEEE;
   8130 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8131 			r600_bytecode_src_set_abs(&alu.src[0]);
   8132 
   8133 			alu.dst.sel = ctx->temp_reg;
   8134 			alu.dst.chan = 1;
   8135 			alu.dst.write = 1;
   8136 			alu.last = 1;
   8137 
   8138 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   8139 			if (r)
   8140 				return r;
   8141 		}
   8142 
   8143 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8144 
   8145 		alu.op = ALU_OP1_FLOOR;
   8146 		alu.src[0].sel = ctx->temp_reg;
   8147 		alu.src[0].chan = 1;
   8148 
   8149 		alu.dst.sel = ctx->temp_reg;
   8150 		alu.dst.chan = 1;
   8151 		alu.dst.write = 1;
   8152 		alu.last = 1;
   8153 
   8154 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8155 		if (r)
   8156 			return r;
   8157 
   8158 		if (ctx->bc->chip_class == CAYMAN) {
   8159 			for (i = 0; i < 3; i++) {
   8160 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8161 				alu.op = ALU_OP1_EXP_IEEE;
   8162 				alu.src[0].sel = ctx->temp_reg;
   8163 				alu.src[0].chan = 1;
   8164 
   8165 				alu.dst.sel = ctx->temp_reg;
   8166 				alu.dst.chan = i;
   8167 				if (i == 1)
   8168 					alu.dst.write = 1;
   8169 				if (i == 2)
   8170 					alu.last = 1;
   8171 
   8172 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   8173 				if (r)
   8174 					return r;
   8175 			}
   8176 		} else {
   8177 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8178 			alu.op = ALU_OP1_EXP_IEEE;
   8179 			alu.src[0].sel = ctx->temp_reg;
   8180 			alu.src[0].chan = 1;
   8181 
   8182 			alu.dst.sel = ctx->temp_reg;
   8183 			alu.dst.chan = 1;
   8184 			alu.dst.write = 1;
   8185 			alu.last = 1;
   8186 
   8187 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   8188 			if (r)
   8189 				return r;
   8190 		}
   8191 
   8192 		if (ctx->bc->chip_class == CAYMAN) {
   8193 			for (i = 0; i < 3; i++) {
   8194 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8195 				alu.op = ALU_OP1_RECIP_IEEE;
   8196 				alu.src[0].sel = ctx->temp_reg;
   8197 				alu.src[0].chan = 1;
   8198 
   8199 				alu.dst.sel = ctx->temp_reg;
   8200 				alu.dst.chan = i;
   8201 				if (i == 1)
   8202 					alu.dst.write = 1;
   8203 				if (i == 2)
   8204 					alu.last = 1;
   8205 
   8206 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   8207 				if (r)
   8208 					return r;
   8209 			}
   8210 		} else {
   8211 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8212 			alu.op = ALU_OP1_RECIP_IEEE;
   8213 			alu.src[0].sel = ctx->temp_reg;
   8214 			alu.src[0].chan = 1;
   8215 
   8216 			alu.dst.sel = ctx->temp_reg;
   8217 			alu.dst.chan = 1;
   8218 			alu.dst.write = 1;
   8219 			alu.last = 1;
   8220 
   8221 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   8222 			if (r)
   8223 				return r;
   8224 		}
   8225 
   8226 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8227 
   8228 		alu.op = ALU_OP2_MUL;
   8229 
   8230 		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8231 		r600_bytecode_src_set_abs(&alu.src[0]);
   8232 
   8233 		alu.src[1].sel = ctx->temp_reg;
   8234 		alu.src[1].chan = 1;
   8235 
   8236 		alu.dst.sel = ctx->temp_reg;
   8237 		alu.dst.chan = 1;
   8238 		alu.dst.write = 1;
   8239 		alu.last = 1;
   8240 
   8241 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8242 		if (r)
   8243 			return r;
   8244 	}
   8245 
   8246 	/* result.z = log2(|src|);*/
   8247 	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
   8248 		if (ctx->bc->chip_class == CAYMAN) {
   8249 			for (i = 0; i < 3; i++) {
   8250 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8251 
   8252 				alu.op = ALU_OP1_LOG_IEEE;
   8253 				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8254 				r600_bytecode_src_set_abs(&alu.src[0]);
   8255 
   8256 				alu.dst.sel = ctx->temp_reg;
   8257 				if (i == 2)
   8258 					alu.dst.write = 1;
   8259 				alu.dst.chan = i;
   8260 				if (i == 2)
   8261 					alu.last = 1;
   8262 
   8263 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   8264 				if (r)
   8265 					return r;
   8266 			}
   8267 		} else {
   8268 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8269 
   8270 			alu.op = ALU_OP1_LOG_IEEE;
   8271 			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8272 			r600_bytecode_src_set_abs(&alu.src[0]);
   8273 
   8274 			alu.dst.sel = ctx->temp_reg;
   8275 			alu.dst.write = 1;
   8276 			alu.dst.chan = 2;
   8277 			alu.last = 1;
   8278 
   8279 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   8280 			if (r)
   8281 				return r;
   8282 		}
   8283 	}
   8284 
   8285 	/* result.w = 1.0; */
   8286 	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
   8287 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8288 
   8289 		alu.op = ALU_OP1_MOV;
   8290 		alu.src[0].sel = V_SQ_ALU_SRC_1;
   8291 		alu.src[0].chan = 0;
   8292 
   8293 		alu.dst.sel = ctx->temp_reg;
   8294 		alu.dst.chan = 3;
   8295 		alu.dst.write = 1;
   8296 		alu.last = 1;
   8297 
   8298 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8299 		if (r)
   8300 			return r;
   8301 	}
   8302 
   8303 	return tgsi_helper_copy(ctx, inst);
   8304 }
   8305 
   8306 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
   8307 {
   8308 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   8309 	struct r600_bytecode_alu alu;
   8310 	int r;
   8311 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   8312 	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
   8313 
   8314 	assert(inst->Dst[0].Register.Index < 3);
   8315 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8316 
   8317 	switch (inst->Instruction.Opcode) {
   8318 	case TGSI_OPCODE_ARL:
   8319 		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
   8320 		break;
   8321 	case TGSI_OPCODE_ARR:
   8322 		alu.op = ALU_OP1_FLT_TO_INT;
   8323 		break;
   8324 	case TGSI_OPCODE_UARL:
   8325 		alu.op = ALU_OP1_MOV;
   8326 		break;
   8327 	default:
   8328 		assert(0);
   8329 		return -1;
   8330 	}
   8331 
   8332 	for (i = 0; i <= lasti; ++i) {
   8333 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   8334 			continue;
   8335 		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   8336 		alu.last = i == lasti;
   8337 		alu.dst.sel = reg;
   8338 	        alu.dst.chan = i;
   8339 		alu.dst.write = 1;
   8340 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8341 		if (r)
   8342 			return r;
   8343 	}
   8344 
   8345 	if (inst->Dst[0].Register.Index > 0)
   8346 		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
   8347 	else
   8348 		ctx->bc->ar_loaded = 0;
   8349 
   8350 	return 0;
   8351 }
   8352 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
   8353 {
   8354 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   8355 	struct r600_bytecode_alu alu;
   8356 	int r;
   8357 	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   8358 
   8359 	switch (inst->Instruction.Opcode) {
   8360 	case TGSI_OPCODE_ARL:
   8361 		memset(&alu, 0, sizeof(alu));
   8362 		alu.op = ALU_OP1_FLOOR;
   8363 		alu.dst.sel = ctx->bc->ar_reg;
   8364 		alu.dst.write = 1;
   8365 		for (i = 0; i <= lasti; ++i) {
   8366 			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
   8367 				alu.dst.chan = i;
   8368 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   8369 				alu.last = i == lasti;
   8370 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   8371 					return r;
   8372 			}
   8373 		}
   8374 
   8375 		memset(&alu, 0, sizeof(alu));
   8376 		alu.op = ALU_OP1_FLT_TO_INT;
   8377 		alu.src[0].sel = ctx->bc->ar_reg;
   8378 		alu.dst.sel = ctx->bc->ar_reg;
   8379 		alu.dst.write = 1;
   8380 		/* FLT_TO_INT is trans-only on r600/r700 */
   8381 		alu.last = TRUE;
   8382 		for (i = 0; i <= lasti; ++i) {
   8383 			alu.dst.chan = i;
   8384 			alu.src[0].chan = i;
   8385 			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   8386 				return r;
   8387 		}
   8388 		break;
   8389 	case TGSI_OPCODE_ARR:
   8390 		memset(&alu, 0, sizeof(alu));
   8391 		alu.op = ALU_OP1_FLT_TO_INT;
   8392 		alu.dst.sel = ctx->bc->ar_reg;
   8393 		alu.dst.write = 1;
   8394 		/* FLT_TO_INT is trans-only on r600/r700 */
   8395 		alu.last = TRUE;
   8396 		for (i = 0; i <= lasti; ++i) {
   8397 			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
   8398 				alu.dst.chan = i;
   8399 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   8400 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   8401 					return r;
   8402 			}
   8403 		}
   8404 		break;
   8405 	case TGSI_OPCODE_UARL:
   8406 		memset(&alu, 0, sizeof(alu));
   8407 		alu.op = ALU_OP1_MOV;
   8408 		alu.dst.sel = ctx->bc->ar_reg;
   8409 		alu.dst.write = 1;
   8410 		for (i = 0; i <= lasti; ++i) {
   8411 			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
   8412 				alu.dst.chan = i;
   8413 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   8414 				alu.last = i == lasti;
   8415 				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
   8416 					return r;
   8417 			}
   8418 		}
   8419 		break;
   8420 	default:
   8421 		assert(0);
   8422 		return -1;
   8423 	}
   8424 
   8425 	ctx->bc->ar_loaded = 0;
   8426 	return 0;
   8427 }
   8428 
   8429 static int tgsi_opdst(struct r600_shader_ctx *ctx)
   8430 {
   8431 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   8432 	struct r600_bytecode_alu alu;
   8433 	int i, r = 0;
   8434 
   8435 	for (i = 0; i < 4; i++) {
   8436 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8437 
   8438 		alu.op = ALU_OP2_MUL;
   8439 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   8440 
   8441 		if (i == 0 || i == 3) {
   8442 			alu.src[0].sel = V_SQ_ALU_SRC_1;
   8443 		} else {
   8444 			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
   8445 		}
   8446 
   8447 		if (i == 0 || i == 2) {
   8448 			alu.src[1].sel = V_SQ_ALU_SRC_1;
   8449 		} else {
   8450 			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
   8451 		}
   8452 		if (i == 3)
   8453 			alu.last = 1;
   8454 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8455 		if (r)
   8456 			return r;
   8457 	}
   8458 	return 0;
   8459 }
   8460 
   8461 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
   8462 {
   8463 	struct r600_bytecode_alu alu;
   8464 	int r;
   8465 
   8466 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8467 	alu.op = opcode;
   8468 	alu.execute_mask = 1;
   8469 	alu.update_pred = 1;
   8470 
   8471 	alu.dst.sel = ctx->temp_reg;
   8472 	alu.dst.write = 1;
   8473 	alu.dst.chan = 0;
   8474 
   8475 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8476 	alu.src[1].sel = V_SQ_ALU_SRC_0;
   8477 	alu.src[1].chan = 0;
   8478 
   8479 	alu.last = 1;
   8480 
   8481 	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
   8482 	if (r)
   8483 		return r;
   8484 	return 0;
   8485 }
   8486 
   8487 static int pops(struct r600_shader_ctx *ctx, int pops)
   8488 {
   8489 	unsigned force_pop = ctx->bc->force_add_cf;
   8490 
   8491 	if (!force_pop) {
   8492 		int alu_pop = 3;
   8493 		if (ctx->bc->cf_last) {
   8494 			if (ctx->bc->cf_last->op == CF_OP_ALU)
   8495 				alu_pop = 0;
   8496 			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
   8497 				alu_pop = 1;
   8498 		}
   8499 		alu_pop += pops;
   8500 		if (alu_pop == 1) {
   8501 			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
   8502 			ctx->bc->force_add_cf = 1;
   8503 		} else if (alu_pop == 2) {
   8504 			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
   8505 			ctx->bc->force_add_cf = 1;
   8506 		} else {
   8507 			force_pop = 1;
   8508 		}
   8509 	}
   8510 
   8511 	if (force_pop) {
   8512 		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
   8513 		ctx->bc->cf_last->pop_count = pops;
   8514 		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
   8515 	}
   8516 
   8517 	return 0;
   8518 }
   8519 
   8520 static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
   8521                                               unsigned reason)
   8522 {
   8523 	struct r600_stack_info *stack = &ctx->bc->stack;
   8524 	unsigned elements, entries;
   8525 
   8526 	unsigned entry_size = stack->entry_size;
   8527 
   8528 	elements = (stack->loop + stack->push_wqm ) * entry_size;
   8529 	elements += stack->push;
   8530 
   8531 	switch (ctx->bc->chip_class) {
   8532 	case R600:
   8533 	case R700:
   8534 		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
   8535 		 * the stack must be reserved to hold the current active/continue
   8536 		 * masks */
   8537 		if (reason == FC_PUSH_VPM) {
   8538 			elements += 2;
   8539 		}
   8540 		break;
   8541 
   8542 	case CAYMAN:
   8543 		/* r9xx: any stack operation on empty stack consumes 2 additional
   8544 		 * elements */
   8545 		elements += 2;
   8546 
   8547 		/* fallthrough */
   8548 		/* FIXME: do the two elements added above cover the cases for the
   8549 		 * r8xx+ below? */
   8550 
   8551 	case EVERGREEN:
   8552 		/* r8xx+: 2 extra elements are not always required, but one extra
   8553 		 * element must be added for each of the following cases:
   8554 		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
   8555 		 *    stack usage.
   8556 		 *    (Currently we don't use ALU_ELSE_AFTER.)
   8557 		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
   8558 		 *    PUSH instruction executed.
   8559 		 *
   8560 		 *    NOTE: it seems we also need to reserve additional element in some
   8561 		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
   8562 		 *    then STACK_SIZE should be 2 instead of 1 */
   8563 		if (reason == FC_PUSH_VPM) {
   8564 			elements += 1;
   8565 		}
   8566 		break;
   8567 
   8568 	default:
   8569 		assert(0);
   8570 		break;
   8571 	}
   8572 
   8573 	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
   8574 	 * for all chips, so we use 4 in the final formula, not the real entry_size
   8575 	 * for the chip */
   8576 	entry_size = 4;
   8577 
   8578 	entries = (elements + (entry_size - 1)) / entry_size;
   8579 
   8580 	if (entries > stack->max_entries)
   8581 		stack->max_entries = entries;
   8582 }
   8583 
   8584 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
   8585 {
   8586 	switch(reason) {
   8587 	case FC_PUSH_VPM:
   8588 		--ctx->bc->stack.push;
   8589 		assert(ctx->bc->stack.push >= 0);
   8590 		break;
   8591 	case FC_PUSH_WQM:
   8592 		--ctx->bc->stack.push_wqm;
   8593 		assert(ctx->bc->stack.push_wqm >= 0);
   8594 		break;
   8595 	case FC_LOOP:
   8596 		--ctx->bc->stack.loop;
   8597 		assert(ctx->bc->stack.loop >= 0);
   8598 		break;
   8599 	default:
   8600 		assert(0);
   8601 		break;
   8602 	}
   8603 }
   8604 
   8605 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
   8606 {
   8607 	switch (reason) {
   8608 	case FC_PUSH_VPM:
   8609 		++ctx->bc->stack.push;
   8610 		break;
   8611 	case FC_PUSH_WQM:
   8612 		++ctx->bc->stack.push_wqm;
   8613 	case FC_LOOP:
   8614 		++ctx->bc->stack.loop;
   8615 		break;
   8616 	default:
   8617 		assert(0);
   8618 	}
   8619 
   8620 	callstack_update_max_depth(ctx, reason);
   8621 }
   8622 
   8623 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
   8624 {
   8625 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
   8626 
   8627 	sp->mid = realloc((void *)sp->mid,
   8628 						sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
   8629 	sp->mid[sp->num_mid] = ctx->bc->cf_last;
   8630 	sp->num_mid++;
   8631 }
   8632 
   8633 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
   8634 {
   8635 	ctx->bc->fc_sp++;
   8636 	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
   8637 	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
   8638 }
   8639 
   8640 static void fc_poplevel(struct r600_shader_ctx *ctx)
   8641 {
   8642 	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
   8643 	free(sp->mid);
   8644 	sp->mid = NULL;
   8645 	sp->num_mid = 0;
   8646 	sp->start = NULL;
   8647 	sp->type = 0;
   8648 	ctx->bc->fc_sp--;
   8649 }
   8650 
   8651 #if 0
   8652 static int emit_return(struct r600_shader_ctx *ctx)
   8653 {
   8654 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
   8655 	return 0;
   8656 }
   8657 
   8658 static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
   8659 {
   8660 
   8661 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
   8662 	ctx->bc->cf_last->pop_count = pops;
   8663 	/* XXX work out offset */
   8664 	return 0;
   8665 }
   8666 
   8667 static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
   8668 {
   8669 	return 0;
   8670 }
   8671 
   8672 static void emit_testflag(struct r600_shader_ctx *ctx)
   8673 {
   8674 
   8675 }
   8676 
   8677 static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
   8678 {
   8679 	emit_testflag(ctx);
   8680 	emit_jump_to_offset(ctx, 1, 4);
   8681 	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
   8682 	pops(ctx, ifidx + 1);
   8683 	emit_return(ctx);
   8684 }
   8685 
   8686 static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
   8687 {
   8688 	emit_testflag(ctx);
   8689 
   8690 	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
   8691 	ctx->bc->cf_last->pop_count = 1;
   8692 
   8693 	fc_set_mid(ctx, fc_sp);
   8694 
   8695 	pops(ctx, 1);
   8696 }
   8697 #endif
   8698 
   8699 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
   8700 {
   8701 	int alu_type = CF_OP_ALU_PUSH_BEFORE;
   8702 
   8703 	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
   8704 	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
   8705 	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
   8706 	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
   8707 	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
   8708 		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
   8709 		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
   8710 		alu_type = CF_OP_ALU;
   8711 	}
   8712 
   8713 	emit_logic_pred(ctx, opcode, alu_type);
   8714 
   8715 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
   8716 
   8717 	fc_pushlevel(ctx, FC_IF);
   8718 
   8719 	callstack_push(ctx, FC_PUSH_VPM);
   8720 	return 0;
   8721 }
   8722 
   8723 static int tgsi_if(struct r600_shader_ctx *ctx)
   8724 {
   8725 	return emit_if(ctx, ALU_OP2_PRED_SETNE);
   8726 }
   8727 
   8728 static int tgsi_uif(struct r600_shader_ctx *ctx)
   8729 {
   8730 	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
   8731 }
   8732 
   8733 static int tgsi_else(struct r600_shader_ctx *ctx)
   8734 {
   8735 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
   8736 	ctx->bc->cf_last->pop_count = 1;
   8737 
   8738 	fc_set_mid(ctx, ctx->bc->fc_sp);
   8739 	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
   8740 	return 0;
   8741 }
   8742 
   8743 static int tgsi_endif(struct r600_shader_ctx *ctx)
   8744 {
   8745 	pops(ctx, 1);
   8746 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
   8747 		R600_ERR("if/endif unbalanced in shader\n");
   8748 		return -1;
   8749 	}
   8750 
   8751 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
   8752 		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
   8753 		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
   8754 	} else {
   8755 		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
   8756 	}
   8757 	fc_poplevel(ctx);
   8758 
   8759 	callstack_pop(ctx, FC_PUSH_VPM);
   8760 	return 0;
   8761 }
   8762 
   8763 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
   8764 {
   8765 	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
   8766 	 * limited to 4096 iterations, like the other LOOP_* instructions. */
   8767 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
   8768 
   8769 	fc_pushlevel(ctx, FC_LOOP);
   8770 
   8771 	/* check stack depth */
   8772 	callstack_push(ctx, FC_LOOP);
   8773 	return 0;
   8774 }
   8775 
   8776 static int tgsi_endloop(struct r600_shader_ctx *ctx)
   8777 {
   8778 	unsigned i;
   8779 
   8780 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
   8781 
   8782 	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
   8783 		R600_ERR("loop/endloop in shader code are not paired.\n");
   8784 		return -EINVAL;
   8785 	}
   8786 
   8787 	/* fixup loop pointers - from r600isa
   8788 	   LOOP END points to CF after LOOP START,
   8789 	   LOOP START point to CF after LOOP END
   8790 	   BRK/CONT point to LOOP END CF
   8791 	*/
   8792 	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
   8793 
   8794 	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
   8795 
   8796 	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
   8797 		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
   8798 	}
   8799 	/* XXX add LOOPRET support */
   8800 	fc_poplevel(ctx);
   8801 	callstack_pop(ctx, FC_LOOP);
   8802 	return 0;
   8803 }
   8804 
   8805 static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
   8806 {
   8807 	int r;
   8808 	unsigned int fscp;
   8809 
   8810 	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
   8811 	{
   8812 		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
   8813 			break;
   8814 	}
   8815 	if (fscp == 0) {
   8816 		R600_ERR("BREAKC not inside loop/endloop pair\n");
   8817 		return -EINVAL;
   8818 	}
   8819 
   8820 	if (ctx->bc->chip_class == EVERGREEN &&
   8821 	    ctx->bc->family != CHIP_CYPRESS &&
   8822 	    ctx->bc->family != CHIP_JUNIPER) {
   8823 		/* HW bug: ALU_BREAK does not save the active mask correctly */
   8824 		r = tgsi_uif(ctx);
   8825 		if (r)
   8826 			return r;
   8827 
   8828 		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
   8829 		if (r)
   8830 			return r;
   8831 		fc_set_mid(ctx, fscp);
   8832 
   8833 		return tgsi_endif(ctx);
   8834 	} else {
   8835 		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
   8836 		if (r)
   8837 			return r;
   8838 		fc_set_mid(ctx, fscp);
   8839 	}
   8840 
   8841 	return 0;
   8842 }
   8843 
   8844 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
   8845 {
   8846 	unsigned int fscp;
   8847 
   8848 	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
   8849 	{
   8850 		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
   8851 			break;
   8852 	}
   8853 
   8854 	if (fscp == 0) {
   8855 		R600_ERR("Break not inside loop/endloop pair\n");
   8856 		return -EINVAL;
   8857 	}
   8858 
   8859 	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
   8860 
   8861 	fc_set_mid(ctx, fscp);
   8862 
   8863 	return 0;
   8864 }
   8865 
   8866 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
   8867 {
   8868 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   8869 	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
   8870 	int r;
   8871 
   8872 	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
   8873 		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
   8874 
   8875 	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
   8876 	if (!r) {
   8877 		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
   8878 		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
   8879 			return emit_inc_ring_offset(ctx, stream, TRUE);
   8880 	}
   8881 	return r;
   8882 }
   8883 
   8884 static int tgsi_umad(struct r600_shader_ctx *ctx)
   8885 {
   8886 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   8887 	struct r600_bytecode_alu alu;
   8888 	int i, j, k, r;
   8889 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   8890 
   8891 	/* src0 * src1 */
   8892 	for (i = 0; i < lasti + 1; i++) {
   8893 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   8894 			continue;
   8895 
   8896 		if (ctx->bc->chip_class == CAYMAN) {
   8897 			for (j = 0 ; j < 4; j++) {
   8898 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8899 
   8900 				alu.op = ALU_OP2_MULLO_UINT;
   8901 				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
   8902 					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
   8903 				}
   8904 				alu.dst.chan = j;
   8905 				alu.dst.sel = ctx->temp_reg;
   8906 				alu.dst.write = (j == i);
   8907 				if (j == 3)
   8908 					alu.last = 1;
   8909 				r = r600_bytecode_add_alu(ctx->bc, &alu);
   8910 				if (r)
   8911 					return r;
   8912 			}
   8913 		} else {
   8914 			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8915 
   8916 			alu.dst.chan = i;
   8917 			alu.dst.sel = ctx->temp_reg;
   8918 			alu.dst.write = 1;
   8919 
   8920 			alu.op = ALU_OP2_MULLO_UINT;
   8921 			for (j = 0; j < 2; j++) {
   8922 				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
   8923 			}
   8924 
   8925 			alu.last = 1;
   8926 			r = r600_bytecode_add_alu(ctx->bc, &alu);
   8927 			if (r)
   8928 				return r;
   8929 		}
   8930 	}
   8931 
   8932 
   8933 	for (i = 0; i < lasti + 1; i++) {
   8934 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   8935 			continue;
   8936 
   8937 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8938 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   8939 
   8940 		alu.op = ALU_OP2_ADD_INT;
   8941 
   8942 		alu.src[0].sel = ctx->temp_reg;
   8943 		alu.src[0].chan = i;
   8944 
   8945 		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
   8946 		if (i == lasti) {
   8947 			alu.last = 1;
   8948 		}
   8949 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8950 		if (r)
   8951 			return r;
   8952 	}
   8953 	return 0;
   8954 }
   8955 
   8956 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
   8957 {
   8958 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   8959 	struct r600_bytecode_alu alu;
   8960 	int r, i;
   8961 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   8962 
   8963 	/* temp.xy = f32_to_f16(src) */
   8964 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8965 	alu.op = ALU_OP1_FLT32_TO_FLT16;
   8966 	alu.dst.chan = 0;
   8967 	alu.dst.sel = ctx->temp_reg;
   8968 	alu.dst.write = 1;
   8969 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   8970 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   8971 	if (r)
   8972 		return r;
   8973 	alu.dst.chan = 1;
   8974 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
   8975 	alu.last = 1;
   8976 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   8977 	if (r)
   8978 		return r;
   8979 
   8980 	/* dst.x = temp.y * 0x10000 + temp.x */
   8981 	for (i = 0; i < lasti + 1; i++) {
   8982 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   8983 			continue;
   8984 
   8985 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   8986 		alu.op = ALU_OP3_MULADD_UINT24;
   8987 		alu.is_op3 = 1;
   8988 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   8989 		alu.last = i == lasti;
   8990 		alu.src[0].sel = ctx->temp_reg;
   8991 		alu.src[0].chan = 1;
   8992 		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   8993 		alu.src[1].value = 0x10000;
   8994 		alu.src[2].sel = ctx->temp_reg;
   8995 		alu.src[2].chan = 0;
   8996 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   8997 		if (r)
   8998 			return r;
   8999 	}
   9000 
   9001 	return 0;
   9002 }
   9003 
   9004 static int tgsi_up2h(struct r600_shader_ctx *ctx)
   9005 {
   9006 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
   9007 	struct r600_bytecode_alu alu;
   9008 	int r, i;
   9009 	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
   9010 
   9011 	/* temp.x = src.x */
   9012 	/* note: no need to mask out the high bits */
   9013 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   9014 	alu.op = ALU_OP1_MOV;
   9015 	alu.dst.chan = 0;
   9016 	alu.dst.sel = ctx->temp_reg;
   9017 	alu.dst.write = 1;
   9018 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   9019 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   9020 	if (r)
   9021 		return r;
   9022 
   9023 	/* temp.y = src.x >> 16 */
   9024 	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   9025 	alu.op = ALU_OP2_LSHR_INT;
   9026 	alu.dst.chan = 1;
   9027 	alu.dst.sel = ctx->temp_reg;
   9028 	alu.dst.write = 1;
   9029 	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
   9030 	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
   9031 	alu.src[1].value = 16;
   9032 	alu.last = 1;
   9033 	r = r600_bytecode_add_alu(ctx->bc, &alu);
   9034 	if (r)
   9035 		return r;
   9036 
   9037 	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
   9038 	for (i = 0; i < lasti + 1; i++) {
   9039 		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
   9040 			continue;
   9041 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
   9042 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
   9043 		alu.op = ALU_OP1_FLT16_TO_FLT32;
   9044 		alu.src[0].sel = ctx->temp_reg;
   9045 		alu.src[0].chan = i % 2;
   9046 		alu.last = i == lasti;
   9047 		r = r600_bytecode_add_alu(ctx->bc, &alu);
   9048 		if (r)
   9049 			return r;
   9050 	}
   9051 
   9052 	return 0;
   9053 }
   9054 
   9055 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
   9056 	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
   9057 	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
   9058 	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
   9059 
   9060 	/* XXX:
   9061 	 * For state trackers other than OpenGL, we'll want to use
   9062 	 * _RECIP_IEEE instead.
   9063 	 */
   9064 	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
   9065 
   9066 	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
   9067 	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
   9068 	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
   9069 	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
   9070 	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
   9071 	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
   9072 	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
   9073 	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
   9074 	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
   9075 	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
   9076 	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
   9077 	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
   9078 	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
   9079 	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
   9080 	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9081 	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
   9082 	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
   9083 	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
   9084 	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
   9085 	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
   9086 	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
   9087 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
   9088 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
   9089 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
   9090 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
   9091 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
   9092 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
   9093 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
   9094 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
   9095 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
   9096 	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
   9097 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
   9098 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
   9099 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
   9100 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
   9101 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
   9102 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
   9103 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
   9104 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9105 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
   9106 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
   9107 	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
   9108 	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
   9109 	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
   9110 	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
   9111 	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
   9112 	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
   9113 	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9114 	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
   9115 	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9116 	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
   9117 	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
   9118 	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
   9119 	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9120 	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
   9121 	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
   9122 	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
   9123 	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
   9124 	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
   9125 	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
   9126 	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
   9127 	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
   9128 	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
   9129 	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
   9130 	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
   9131 	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
   9132 	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
   9133 	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
   9134 	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
   9135 	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
   9136 	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
   9137 	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
   9138 	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
   9139 	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
   9140 	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9141 	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9142 	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9143 	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9144 	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
   9145 	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
   9146 	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
   9147 	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
   9148 	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
   9149 	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
   9150 	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
   9151 	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
   9152 	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
   9153 	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
   9154 	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9155 	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
   9156 	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
   9157 	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
   9158 	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
   9159 	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
   9160 	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
   9161 	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9162 	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
   9163 	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9164 	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
   9165 	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
   9166 	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
   9167 	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
   9168 	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
   9169 	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
   9170 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
   9171 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
   9172 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
   9173 	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9174 	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
   9175 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
   9176 	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
   9177 	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
   9178 	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
   9179 	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9180 	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
   9181 	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
   9182 	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
   9183 	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
   9184 	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
   9185 	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
   9186 	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
   9187 	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
   9188 	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
   9189 	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
   9190 	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
   9191 	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
   9192 	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
   9193 	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
   9194 	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
   9195 	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
   9196 	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
   9197 	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
   9198 	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
   9199 	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
   9200 	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
   9201 	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
   9202 	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
   9203 	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9204 	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
   9205 	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
   9206 	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
   9207 	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
   9208 	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
   9209 	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
   9210 	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
   9211 	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
   9212 	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
   9213 	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
   9214 	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
   9215 	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
   9216 	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
   9217 	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
   9218 	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
   9219 	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
   9220 	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
   9221 	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
   9222 	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9223 	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9224 	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9225 	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9226 	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9227 	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
   9228 	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9229 	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
   9230 	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
   9231 	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
   9232 	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9233 	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9234 	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
   9235 	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
   9236 	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
   9237 	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
   9238 	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9239 	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
   9240 	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
   9241 	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
   9242 	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
   9243 	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
   9244 	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
   9245 	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
   9246 	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
   9247 	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
   9248 	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
   9249 	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
   9250 	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
   9251 	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
   9252 	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
   9253 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
   9254 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9255 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
   9256 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
   9257 };
   9258 
   9259 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
   9260 	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
   9261 	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
   9262 	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
   9263 	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
   9264 	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
   9265 	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
   9266 	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
   9267 	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
   9268 	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
   9269 	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
   9270 	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
   9271 	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
   9272 	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
   9273 	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
   9274 	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
   9275 	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
   9276 	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
   9277 	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
   9278 	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
   9279 	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
   9280 	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
   9281 	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
   9282 	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
   9283 	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
   9284 	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
   9285 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
   9286 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
   9287 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
   9288 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
   9289 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
   9290 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
   9291 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
   9292 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
   9293 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
   9294 	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
   9295 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
   9296 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
   9297 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
   9298 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
   9299 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
   9300 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
   9301 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
   9302 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9303 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
   9304 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
   9305 	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
   9306 	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
   9307 	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
   9308 	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
   9309 	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
   9310 	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
   9311 	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9312 	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
   9313 	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9314 	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
   9315 	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
   9316 	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
   9317 	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9318 	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
   9319 	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
   9320 	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
   9321 	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
   9322 	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
   9323 	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
   9324 	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
   9325 	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
   9326 	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
   9327 	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
   9328 	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
   9329 	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
   9330 	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
   9331 	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
   9332 	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
   9333 	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
   9334 	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
   9335 	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
   9336 	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
   9337 	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
   9338 	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
   9339 	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
   9340 	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9341 	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9342 	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
   9343 	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
   9344 	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
   9345 	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
   9346 	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
   9347 	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
   9348 	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
   9349 	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
   9350 	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
   9351 	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
   9352 	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9353 	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
   9354 	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
   9355 	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
   9356 	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
   9357 	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
   9358 	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
   9359 	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9360 	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
   9361 	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9362 	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
   9363 	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
   9364 	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
   9365 	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
   9366 	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
   9367 	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
   9368 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
   9369 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
   9370 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
   9371 	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9372 	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
   9373 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
   9374 	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
   9375 	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
   9376 	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
   9377 	/* Refer below for TGSI_OPCODE_DFMA */
   9378 	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
   9379 	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
   9380 	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
   9381 	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
   9382 	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
   9383 	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
   9384 	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
   9385 	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
   9386 	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
   9387 	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
   9388 	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
   9389 	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
   9390 	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
   9391 	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
   9392 	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
   9393 	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
   9394 	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
   9395 	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
   9396 	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
   9397 	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
   9398 	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
   9399 	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
   9400 	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
   9401 	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9402 	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
   9403 	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
   9404 	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
   9405 	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
   9406 	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
   9407 	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
   9408 	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
   9409 	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
   9410 	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
   9411 	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
   9412 	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
   9413 	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
   9414 	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
   9415 	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
   9416 	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
   9417 	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
   9418 	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
   9419 	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
   9420 	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9421 	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9422 	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9423 	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9424 	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9425 	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
   9426 	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9427 	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
   9428 	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
   9429 	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
   9430 	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9431 	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9432 	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
   9433 	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
   9434 	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
   9435 	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
   9436 	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9437 	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
   9438 	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
   9439 	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
   9440 	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
   9441 	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
   9442 	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
   9443 	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
   9444 	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
   9445 	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
   9446 	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
   9447 	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
   9448 	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
   9449 	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
   9450 	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
   9451 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
   9452 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
   9453 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
   9454 	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
   9455 	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
   9456 	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
   9457 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
   9458 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
   9459 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
   9460 	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
   9461 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
   9462 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
   9463 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
   9464 	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
   9465 	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
   9466 	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
   9467 	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
   9468 	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
   9469 	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
   9470 	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
   9471 	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
   9472 	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
   9473 	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
   9474 	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
   9475 	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
   9476 	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
   9477 	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
   9478 	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
   9479 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
   9480 };
   9481 
   9482 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
   9483 	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
   9484 	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
   9485 	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
   9486 	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
   9487 	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
   9488 	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
   9489 	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
   9490 	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
   9491 	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
   9492 	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
   9493 	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
   9494 	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
   9495 	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
   9496 	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
   9497 	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
   9498 	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
   9499 	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
   9500 	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
   9501 	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
   9502 	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
   9503 	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
   9504 	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
   9505 	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
   9506 	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
   9507 	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
   9508 	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
   9509 	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
   9510 	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
   9511 	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
   9512 	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, cayman_pow},
   9513 	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
   9514 	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
   9515 	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
   9516 	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
   9517 	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
   9518 	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, cayman_trig},
   9519 	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
   9520 	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
   9521 	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
   9522 	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
   9523 	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
   9524 	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
   9525 	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9526 	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
   9527 	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
   9528 	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
   9529 	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
   9530 	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, cayman_trig},
   9531 	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
   9532 	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
   9533 	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
   9534 	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9535 	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
   9536 	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9537 	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
   9538 	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
   9539 	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
   9540 	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9541 	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
   9542 	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
   9543 	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
   9544 	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
   9545 	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
   9546 	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
   9547 	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
   9548 	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
   9549 	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
   9550 	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
   9551 	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
   9552 	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
   9553 	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
   9554 	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
   9555 	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
   9556 	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
   9557 	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
   9558 	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
   9559 	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
   9560 	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
   9561 	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
   9562 	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
   9563 	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9564 	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
   9565 	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
   9566 	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2},
   9567 	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
   9568 	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
   9569 	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
   9570 	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
   9571 	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
   9572 	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
   9573 	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
   9574 	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
   9575 	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9576 	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
   9577 	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
   9578 	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
   9579 	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
   9580 	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
   9581 	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
   9582 	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9583 	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
   9584 	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
   9585 	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
   9586 	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
   9587 	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
   9588 	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
   9589 	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
   9590 	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
   9591 	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
   9592 	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
   9593 	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
   9594 	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9595 	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
   9596 	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
   9597 	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
   9598 	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
   9599 	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
   9600 	/* Refer below for TGSI_OPCODE_DFMA */
   9601 	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2},
   9602 	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
   9603 	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
   9604 	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
   9605 	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
   9606 	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
   9607 	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
   9608 	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
   9609 	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2},
   9610 	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2},
   9611 	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
   9612 	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
   9613 	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
   9614 	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
   9615 	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
   9616 	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
   9617 	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
   9618 	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
   9619 	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
   9620 	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
   9621 	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
   9622 	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
   9623 	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
   9624 	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9625 	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
   9626 	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
   9627 	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
   9628 	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
   9629 	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
   9630 	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
   9631 	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
   9632 	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
   9633 	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
   9634 	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
   9635 	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
   9636 	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
   9637 	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
   9638 	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
   9639 	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
   9640 	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
   9641 	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
   9642 	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
   9643 	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9644 	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9645 	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9646 	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9647 	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
   9648 	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
   9649 	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
   9650 	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
   9651 	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
   9652 	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
   9653 	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9654 	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
   9655 	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
   9656 	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
   9657 	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
   9658 	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
   9659 	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
   9660 	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
   9661 	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
   9662 	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
   9663 	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
   9664 	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
   9665 	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
   9666 	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
   9667 	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
   9668 	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
   9669 	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
   9670 	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
   9671 	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
   9672 	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
   9673 	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
   9674 	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
   9675 	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
   9676 	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
   9677 	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
   9678 	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
   9679 	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
   9680 	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
   9681 	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
   9682 	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
   9683 	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
   9684 	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
   9685 	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
   9686 	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
   9687 	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
   9688 	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
   9689 	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
   9690 	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
   9691 	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
   9692 	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
   9693 	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
   9694 	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
   9695 	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
   9696 	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
   9697 	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
   9698 	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
   9699 	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
   9700 	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
   9701 	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
   9702 	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
   9703 };
   9704