Home | History | Annotate | Download | only in compiler
      1 /*
      2  * Copyright 2009 Nicolai Hähnle <nhaehnle (at) gmail.com>
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * on the rights to use, copy, modify, merge, publish, distribute, sub
      8  * license, and/or sell copies of the Software, and to permit persons to whom
      9  * the Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
     18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     21  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
     22 
     23 #include "radeon_compiler.h"
     24 
     25 #include <stdio.h>
     26 
     27 #include "r300_reg.h"
     28 
     29 #include "radeon_compiler_util.h"
     30 #include "radeon_dataflow.h"
     31 #include "radeon_program.h"
     32 #include "radeon_program_alu.h"
     33 #include "radeon_swizzle.h"
     34 #include "radeon_emulate_branches.h"
     35 #include "radeon_emulate_loops.h"
     36 #include "radeon_remove_constants.h"
     37 
     38 /*
     39  * Take an already-setup and valid source then swizzle it appropriately to
     40  * obtain a constant ZERO or ONE source.
     41  */
     42 #define __CONST(x, y)	\
     43 	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
     44 			   t_swizzle(y),	\
     45 			   t_swizzle(y),	\
     46 			   t_swizzle(y),	\
     47 			   t_swizzle(y),	\
     48 			   t_src_class(vpi->SrcReg[x].File), \
     49 			   RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4))
     50 
     51 
     52 static unsigned long t_dst_mask(unsigned int mask)
     53 {
     54 	/* RC_MASK_* is equivalent to VSF_FLAG_* */
     55 	return mask & RC_MASK_XYZW;
     56 }
     57 
     58 static unsigned long t_dst_class(rc_register_file file)
     59 {
     60 	switch (file) {
     61 	default:
     62 		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
     63 		/* fall-through */
     64 	case RC_FILE_TEMPORARY:
     65 		return PVS_DST_REG_TEMPORARY;
     66 	case RC_FILE_OUTPUT:
     67 		return PVS_DST_REG_OUT;
     68 	case RC_FILE_ADDRESS:
     69 		return PVS_DST_REG_A0;
     70 	}
     71 }
     72 
     73 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
     74 				 struct rc_dst_register *dst)
     75 {
     76 	if (dst->File == RC_FILE_OUTPUT)
     77 		return vp->outputs[dst->Index];
     78 
     79 	return dst->Index;
     80 }
     81 
     82 static unsigned long t_src_class(rc_register_file file)
     83 {
     84 	switch (file) {
     85 	default:
     86 		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
     87 		/* fall-through */
     88 	case RC_FILE_NONE:
     89 	case RC_FILE_TEMPORARY:
     90 		return PVS_SRC_REG_TEMPORARY;
     91 	case RC_FILE_INPUT:
     92 		return PVS_SRC_REG_INPUT;
     93 	case RC_FILE_CONSTANT:
     94 		return PVS_SRC_REG_CONSTANT;
     95 	}
     96 }
     97 
     98 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
     99 {
    100 	unsigned long aclass = t_src_class(a.File);
    101 	unsigned long bclass = t_src_class(b.File);
    102 
    103 	if (aclass != bclass)
    104 		return 0;
    105 	if (aclass == PVS_SRC_REG_TEMPORARY)
    106 		return 0;
    107 
    108 	if (a.RelAddr || b.RelAddr)
    109 		return 1;
    110 	if (a.Index != b.Index)
    111 		return 1;
    112 
    113 	return 0;
    114 }
    115 
    116 static inline unsigned long t_swizzle(unsigned int swizzle)
    117 {
    118 	/* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
    119 	return swizzle;
    120 }
    121 
    122 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
    123 				 struct rc_src_register *src)
    124 {
    125 	if (src->File == RC_FILE_INPUT) {
    126 		assert(vp->inputs[src->Index] != -1);
    127 		return vp->inputs[src->Index];
    128 	} else {
    129 		if (src->Index < 0) {
    130 			fprintf(stderr,
    131 				"negative offsets for indirect addressing do not work.\n");
    132 			return 0;
    133 		}
    134 		return src->Index;
    135 	}
    136 }
    137 
    138 /* these two functions should probably be merged... */
    139 
    140 static unsigned long t_src(struct r300_vertex_program_code *vp,
    141 			   struct rc_src_register *src)
    142 {
    143 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
    144 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
    145 	 */
    146 	return PVS_SRC_OPERAND(t_src_index(vp, src),
    147 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
    148 			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
    149 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
    150 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
    151 			       t_src_class(src->File),
    152 			       src->Negate) |
    153 	       (src->RelAddr << 4) | (src->Abs << 3);
    154 }
    155 
    156 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
    157 				  struct rc_src_register *src)
    158 {
    159 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
    160 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
    161 	 */
    162 	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
    163 
    164 	return PVS_SRC_OPERAND(t_src_index(vp, src),
    165 			       t_swizzle(swz),
    166 			       t_swizzle(swz),
    167 			       t_swizzle(swz),
    168 			       t_swizzle(swz),
    169 			       t_src_class(src->File),
    170 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
    171 	       (src->RelAddr << 4) | (src->Abs << 3);
    172 }
    173 
    174 static int valid_dst(struct r300_vertex_program_code *vp,
    175 			   struct rc_dst_register *dst)
    176 {
    177 	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
    178 		return 0;
    179 	} else if (dst->File == RC_FILE_ADDRESS) {
    180 		assert(dst->Index == 0);
    181 	}
    182 
    183 	return 1;
    184 }
    185 
    186 static void ei_vector1(struct r300_vertex_program_code *vp,
    187 				unsigned int hw_opcode,
    188 				struct rc_sub_instruction *vpi,
    189 				unsigned int * inst)
    190 {
    191 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
    192 				     0,
    193 				     0,
    194 				     t_dst_index(vp, &vpi->DstReg),
    195 				     t_dst_mask(vpi->DstReg.WriteMask),
    196 				     t_dst_class(vpi->DstReg.File),
    197                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    198 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
    199 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
    200 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
    201 }
    202 
    203 static void ei_vector2(struct r300_vertex_program_code *vp,
    204 				unsigned int hw_opcode,
    205 				struct rc_sub_instruction *vpi,
    206 				unsigned int * inst)
    207 {
    208 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
    209 				     0,
    210 				     0,
    211 				     t_dst_index(vp, &vpi->DstReg),
    212 				     t_dst_mask(vpi->DstReg.WriteMask),
    213 				     t_dst_class(vpi->DstReg.File),
    214                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    215 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
    216 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
    217 	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
    218 }
    219 
    220 static void ei_math1(struct r300_vertex_program_code *vp,
    221 				unsigned int hw_opcode,
    222 				struct rc_sub_instruction *vpi,
    223 				unsigned int * inst)
    224 {
    225 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
    226 				     1,
    227 				     0,
    228 				     t_dst_index(vp, &vpi->DstReg),
    229 				     t_dst_mask(vpi->DstReg.WriteMask),
    230 				     t_dst_class(vpi->DstReg.File),
    231                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    232 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
    233 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
    234 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
    235 }
    236 
    237 static void ei_lit(struct r300_vertex_program_code *vp,
    238 				      struct rc_sub_instruction *vpi,
    239 				      unsigned int * inst)
    240 {
    241 	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
    242 
    243 	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
    244 				     1,
    245 				     0,
    246 				     t_dst_index(vp, &vpi->DstReg),
    247 				     t_dst_mask(vpi->DstReg.WriteMask),
    248 				     t_dst_class(vpi->DstReg.File),
    249                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    250 	/* NOTE: Users swizzling might not work. */
    251 	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
    252 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
    253 				  PVS_SRC_SELECT_FORCE_0,	// Z
    254 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
    255 				  t_src_class(vpi->SrcReg[0].File),
    256 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
    257 	    (vpi->SrcReg[0].RelAddr << 4);
    258 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
    259 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
    260 				  PVS_SRC_SELECT_FORCE_0,	// Z
    261 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
    262 				  t_src_class(vpi->SrcReg[0].File),
    263 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
    264 	    (vpi->SrcReg[0].RelAddr << 4);
    265 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
    266 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
    267 				  PVS_SRC_SELECT_FORCE_0,	// Z
    268 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
    269 				  t_src_class(vpi->SrcReg[0].File),
    270 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
    271 	    (vpi->SrcReg[0].RelAddr << 4);
    272 }
    273 
    274 static void ei_mad(struct r300_vertex_program_code *vp,
    275 				      struct rc_sub_instruction *vpi,
    276 				      unsigned int * inst)
    277 {
    278 	unsigned int i;
    279 	/* Remarks about hardware limitations of MAD
    280 	 * (please preserve this comment, as this information is _NOT_
    281 	 * in the documentation provided by AMD).
    282 	 *
    283 	 * As described in the documentation, MAD with three unique temporary
    284 	 * source registers requires the use of the macro version.
    285 	 *
    286 	 * However (and this is not mentioned in the documentation), apparently
    287 	 * the macro version is _NOT_ a full superset of the normal version.
    288 	 * In particular, the macro version does not always work when relative
    289 	 * addressing is used in the source operands.
    290 	 *
    291 	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
    292 	 * assembly shader path when using medium quality animations
    293 	 * (i.e. animations with matrix blending instead of quaternion blending).
    294 	 *
    295 	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
    296 	 * test for this issue - for some reason, it is possible to have vertex
    297 	 * programs whose prefix is *exactly* the same as the prefix of the
    298 	 * offending program in Sauerbraten up to the offending instruction
    299 	 * without causing any trouble.
    300 	 *
    301 	 * Bottom line: Only use the macro version only when really necessary;
    302 	 * according to AMD docs, this should improve performance by one clock
    303 	 * as a nice side bonus.
    304 	 */
    305 	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
    306 	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
    307 	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
    308 	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
    309 	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
    310 	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
    311 		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
    312 				0,
    313 				1,
    314 				t_dst_index(vp, &vpi->DstReg),
    315 				t_dst_mask(vpi->DstReg.WriteMask),
    316 				t_dst_class(vpi->DstReg.File),
    317                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    318 	} else {
    319 		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
    320 				0,
    321 				0,
    322 				t_dst_index(vp, &vpi->DstReg),
    323 				t_dst_mask(vpi->DstReg.WriteMask),
    324 				t_dst_class(vpi->DstReg.File),
    325                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    326 
    327 		/* Arguments with constant swizzles still count as a unique
    328 		 * temporary, so we should make sure these arguments share a
    329 		 * register index with one of the other arguments. */
    330 		for (i = 0; i < 3; i++) {
    331 			unsigned int j;
    332 			if (vpi->SrcReg[i].File != RC_FILE_NONE)
    333 				continue;
    334 
    335 			for (j = 0; j < 3; j++) {
    336 				if (i != j) {
    337 					vpi->SrcReg[i].Index =
    338 						vpi->SrcReg[j].Index;
    339 					break;
    340 				}
    341 			}
    342 		}
    343 	}
    344 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
    345 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
    346 	inst[3] = t_src(vp, &vpi->SrcReg[2]);
    347 }
    348 
    349 static void ei_pow(struct r300_vertex_program_code *vp,
    350 				      struct rc_sub_instruction *vpi,
    351 				      unsigned int * inst)
    352 {
    353 	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
    354 				     1,
    355 				     0,
    356 				     t_dst_index(vp, &vpi->DstReg),
    357 				     t_dst_mask(vpi->DstReg.WriteMask),
    358 				     t_dst_class(vpi->DstReg.File),
    359                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
    360 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
    361 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
    362 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
    363 }
    364 
    365 static void translate_vertex_program(struct radeon_compiler *c, void *user)
    366 {
    367 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
    368 	struct rc_instruction *rci;
    369 
    370 	unsigned loops[R500_PVS_MAX_LOOP_DEPTH];
    371 	unsigned loop_depth = 0;
    372 
    373 	compiler->code->pos_end = 0;	/* Not supported yet */
    374 	compiler->code->length = 0;
    375 	compiler->code->num_temporaries = 0;
    376 
    377 	compiler->SetHwInputOutput(compiler);
    378 
    379 	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
    380 		struct rc_sub_instruction *vpi = &rci->U.I;
    381 		unsigned int *inst = compiler->code->body.d + compiler->code->length;
    382 		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
    383 
    384 		/* Skip instructions writing to non-existing destination */
    385 		if (!valid_dst(compiler->code, &vpi->DstReg))
    386 			continue;
    387 
    388 		if (info->HasDstReg) {
    389 			/* Neither is Saturate. */
    390 			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
    391 				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
    392 					 "modifier (yet).\n");
    393 			}
    394 		}
    395 
    396 		if (compiler->code->length >= c->max_alu_insts * 4) {
    397 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
    398 			return;
    399 		}
    400 
    401 		assert(compiler->Base.is_r500 ||
    402 		       (vpi->Opcode != RC_OPCODE_SEQ &&
    403 			vpi->Opcode != RC_OPCODE_SNE));
    404 
    405 		switch (vpi->Opcode) {
    406 		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
    407 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
    408 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
    409 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
    410 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
    411 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
    412 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
    413 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
    414 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
    415 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
    416 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
    417 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
    418 		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
    419 		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
    420 		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
    421 		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
    422 		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
    423 		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
    424 		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
    425 		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
    426 		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
    427 		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
    428 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
    429 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
    430 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
    431 		case RC_OPCODE_BGNLOOP:
    432 		{
    433 			if ((!compiler->Base.is_r500
    434 				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
    435 				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
    436 				rc_error(&compiler->Base,
    437 						"Loops are nested too deep.");
    438 				return;
    439 			}
    440 			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
    441 			break;
    442 		}
    443 		case RC_OPCODE_ENDLOOP:
    444 		{
    445 			unsigned int act_addr;
    446 			unsigned int last_addr;
    447 			unsigned int ret_addr;
    448 
    449 			ret_addr = loops[--loop_depth];
    450 			act_addr = ret_addr - 1;
    451 			last_addr = (compiler->code->length / 4) - 1;
    452 
    453 			if (loop_depth >= R300_VS_MAX_FC_OPS) {
    454 				rc_error(&compiler->Base,
    455 					"Too many flow control instructions.");
    456 				return;
    457 			}
    458 			if (compiler->Base.is_r500) {
    459 				compiler->code->fc_op_addrs.r500
    460 					[compiler->code->num_fc_ops].lw =
    461 					R500_PVS_FC_ACT_ADRS(act_addr)
    462 					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
    463 					;
    464 				compiler->code->fc_op_addrs.r500
    465 					[compiler->code->num_fc_ops].uw =
    466 					R500_PVS_FC_LAST_INST(last_addr)
    467 					| R500_PVS_FC_RTN_INST(ret_addr)
    468 					;
    469 			} else {
    470 				compiler->code->fc_op_addrs.r300
    471 					[compiler->code->num_fc_ops] =
    472 					R300_PVS_FC_ACT_ADRS(act_addr)
    473 					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
    474 					| R300_PVS_FC_LAST_INST(last_addr)
    475 					| R300_PVS_FC_RTN_INST(ret_addr)
    476 					;
    477 			}
    478 			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
    479 				R300_PVS_FC_LOOP_INIT_VAL(0x0)
    480 				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
    481 				;
    482 			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
    483 						compiler->code->num_fc_ops);
    484 			compiler->code->num_fc_ops++;
    485 
    486 			break;
    487 		}
    488 
    489 		case RC_ME_PRED_SET_CLR:
    490 			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
    491 			break;
    492 
    493 		case RC_ME_PRED_SET_INV:
    494 			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
    495 			break;
    496 
    497 		case RC_ME_PRED_SET_POP:
    498 			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
    499 			break;
    500 
    501 		case RC_ME_PRED_SET_RESTORE:
    502 			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
    503 			break;
    504 
    505 		case RC_ME_PRED_SEQ:
    506 			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
    507 			break;
    508 
    509 		case RC_ME_PRED_SNEQ:
    510 			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
    511 			break;
    512 
    513 		case RC_VE_PRED_SNEQ_PUSH:
    514 			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
    515 								vpi, inst);
    516 			break;
    517 
    518 		default:
    519 			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
    520 			return;
    521 		}
    522 
    523 		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
    524 			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
    525 						<< PVS_DST_PRED_ENABLE_SHIFT);
    526 			if (vpi->DstReg.Pred == RC_PRED_SET) {
    527 				inst[0] |= (PVS_DST_PRED_SENSE_MASK
    528 						<< PVS_DST_PRED_SENSE_SHIFT);
    529 			}
    530 		}
    531 
    532 		/* Update the number of temporaries. */
    533 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
    534 		    vpi->DstReg.Index >= compiler->code->num_temporaries)
    535 			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
    536 
    537 		for (unsigned i = 0; i < info->NumSrcRegs; i++)
    538 			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
    539 			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
    540 				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
    541 
    542 		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
    543 			rc_error(&compiler->Base, "Too many temporaries.\n");
    544 			return;
    545 		}
    546 
    547 		compiler->code->length += 4;
    548 
    549 		if (compiler->Base.Error)
    550 			return;
    551 	}
    552 }
    553 
    554 struct temporary_allocation {
    555 	unsigned int Allocated:1;
    556 	unsigned int HwTemp:15;
    557 	struct rc_instruction * LastRead;
    558 };
    559 
    560 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
    561 {
    562 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
    563 	struct rc_instruction *inst;
    564 	struct rc_instruction *end_loop = NULL;
    565 	unsigned int num_orig_temps = 0;
    566 	char hwtemps[RC_REGISTER_MAX_INDEX];
    567 	struct temporary_allocation * ta;
    568 	unsigned int i, j;
    569 
    570 	memset(hwtemps, 0, sizeof(hwtemps));
    571 
    572 	rc_recompute_ips(c);
    573 
    574 	/* Pass 1: Count original temporaries. */
    575 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
    576 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
    577 
    578 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
    579 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
    580 				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
    581 					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
    582 			}
    583 		}
    584 
    585 		if (opcode->HasDstReg) {
    586 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
    587 				if (inst->U.I.DstReg.Index >= num_orig_temps)
    588 					num_orig_temps = inst->U.I.DstReg.Index + 1;
    589 			}
    590 		}
    591 	}
    592 
    593 	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
    594 			sizeof(struct temporary_allocation) * num_orig_temps);
    595 	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
    596 
    597 	/* Pass 2: Determine original temporary lifetimes */
    598 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
    599 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
    600 		/* Instructions inside of loops need to use the ENDLOOP
    601 		 * instruction as their LastRead. */
    602 		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
    603 			int endloops = 1;
    604 			struct rc_instruction * ptr;
    605 			for(ptr = inst->Next;
    606 				ptr != &compiler->Base.Program.Instructions;
    607 							ptr = ptr->Next){
    608 				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
    609 					endloops++;
    610 				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
    611 					endloops--;
    612 					if (endloops <= 0) {
    613 						end_loop = ptr;
    614 						break;
    615 					}
    616 				}
    617 			}
    618 		}
    619 
    620 		if (inst == end_loop) {
    621 			end_loop = NULL;
    622 			continue;
    623 		}
    624 
    625 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
    626 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
    627 				ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
    628 			}
    629 		}
    630 	}
    631 
    632 	/* Pass 3: Register allocation */
    633 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
    634 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
    635 
    636 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
    637 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
    638 				unsigned int orig = inst->U.I.SrcReg[i].Index;
    639 				inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
    640 
    641 				if (ta[orig].Allocated && inst == ta[orig].LastRead)
    642 					hwtemps[ta[orig].HwTemp] = 0;
    643 			}
    644 		}
    645 
    646 		if (opcode->HasDstReg) {
    647 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
    648 				unsigned int orig = inst->U.I.DstReg.Index;
    649 
    650 				if (!ta[orig].Allocated) {
    651 					for(j = 0; j < c->max_temp_regs; ++j) {
    652 						if (!hwtemps[j])
    653 							break;
    654 					}
    655 					ta[orig].Allocated = 1;
    656 					ta[orig].HwTemp = j;
    657 					hwtemps[ta[orig].HwTemp] = 1;
    658 				}
    659 
    660 				inst->U.I.DstReg.Index = ta[orig].HwTemp;
    661 			}
    662 		}
    663 	}
    664 }
    665 
    666 /**
    667  * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
    668  * and the Saturate opcode modifier. Only Absolute is currently transformed.
    669  */
    670 static int transform_nonnative_modifiers(
    671 	struct radeon_compiler *c,
    672 	struct rc_instruction *inst,
    673 	void* unused)
    674 {
    675 	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
    676 	unsigned i;
    677 
    678 	/* Transform ABS(a) to MAX(a, -a). */
    679 	for (i = 0; i < opcode->NumSrcRegs; i++) {
    680 		if (inst->U.I.SrcReg[i].Abs) {
    681 			struct rc_instruction *new_inst;
    682 			unsigned temp;
    683 
    684 			inst->U.I.SrcReg[i].Abs = 0;
    685 
    686 			temp = rc_find_free_temporary(c);
    687 
    688 			new_inst = rc_insert_new_instruction(c, inst->Prev);
    689 			new_inst->U.I.Opcode = RC_OPCODE_MAX;
    690 			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
    691 			new_inst->U.I.DstReg.Index = temp;
    692 			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
    693 			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
    694 			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
    695 
    696 			memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
    697 			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
    698 			inst->U.I.SrcReg[i].Index = temp;
    699 			inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
    700 		}
    701 	}
    702 	return 1;
    703 }
    704 
    705 /**
    706  * Vertex engine cannot read two inputs or two constants at the same time.
    707  * Introduce intermediate MOVs to temporary registers to account for this.
    708  */
    709 static int transform_source_conflicts(
    710 	struct radeon_compiler *c,
    711 	struct rc_instruction* inst,
    712 	void* unused)
    713 {
    714 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
    715 
    716 	if (opcode->NumSrcRegs == 3) {
    717 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
    718 		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
    719 			int tmpreg = rc_find_free_temporary(c);
    720 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
    721 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
    722 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
    723 			inst_mov->U.I.DstReg.Index = tmpreg;
    724 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
    725 
    726 			reset_srcreg(&inst->U.I.SrcReg[2]);
    727 			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
    728 			inst->U.I.SrcReg[2].Index = tmpreg;
    729 		}
    730 	}
    731 
    732 	if (opcode->NumSrcRegs >= 2) {
    733 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
    734 			int tmpreg = rc_find_free_temporary(c);
    735 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
    736 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
    737 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
    738 			inst_mov->U.I.DstReg.Index = tmpreg;
    739 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
    740 
    741 			reset_srcreg(&inst->U.I.SrcReg[1]);
    742 			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
    743 			inst->U.I.SrcReg[1].Index = tmpreg;
    744 		}
    745 	}
    746 
    747 	return 1;
    748 }
    749 
    750 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
    751 {
    752 	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
    753 	int i;
    754 
    755 	for(i = 0; i < 32; ++i) {
    756 		if ((compiler->RequiredOutputs & (1 << i)) &&
    757 		    !(compiler->Base.Program.OutputsWritten & (1 << i))) {
    758 			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
    759 			inst->U.I.Opcode = RC_OPCODE_MOV;
    760 
    761 			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
    762 			inst->U.I.DstReg.Index = i;
    763 			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
    764 
    765 			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
    766 			inst->U.I.SrcReg[0].Index = 0;
    767 			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
    768 
    769 			compiler->Base.Program.OutputsWritten |= 1 << i;
    770 		}
    771 	}
    772 }
    773 
    774 static void dataflow_outputs_mark_used(void * userdata, void * data,
    775 		void (*callback)(void *, unsigned int, unsigned int))
    776 {
    777 	struct r300_vertex_program_compiler * c = userdata;
    778 	int i;
    779 
    780 	for(i = 0; i < 32; ++i) {
    781 		if (c->RequiredOutputs & (1 << i))
    782 			callback(data, i, RC_MASK_XYZW);
    783 	}
    784 }
    785 
    786 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
    787 {
    788 	(void) opcode;
    789 	(void) reg;
    790 
    791 	return 1;
    792 }
    793 
    794 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
    795 					  struct rc_instruction *arl,
    796 					  struct rc_instruction *end,
    797 					  int min_offset)
    798 {
    799 	struct rc_instruction *inst, *add;
    800 	unsigned const_swizzle;
    801 
    802 	/* Transform ARL/ARR */
    803 	add = rc_insert_new_instruction(&c->Base, arl->Prev);
    804 	add->U.I.Opcode = RC_OPCODE_ADD;
    805 	add->U.I.DstReg.File = RC_FILE_TEMPORARY;
    806 	add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
    807 	add->U.I.DstReg.WriteMask = RC_MASK_X;
    808 	add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
    809 	add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
    810 	add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
    811 								     min_offset, &const_swizzle);
    812 	add->U.I.SrcReg[1].Swizzle = const_swizzle;
    813 
    814 	arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
    815 	arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
    816 	arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
    817 
    818 	/* Rewrite offsets up to and excluding inst. */
    819 	for (inst = arl->Next; inst != end; inst = inst->Next) {
    820 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
    821 
    822 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
    823 			if (inst->U.I.SrcReg[i].RelAddr)
    824 				inst->U.I.SrcReg[i].Index -= min_offset;
    825 	}
    826 }
    827 
    828 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
    829 {
    830 	struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
    831 	struct rc_instruction *inst, *lastARL = NULL;
    832 	int min_offset = 0;
    833 
    834 	for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
    835 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
    836 
    837 		if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
    838 			if (lastARL != NULL && min_offset < 0)
    839 				transform_negative_addressing(c, lastARL, inst, min_offset);
    840 
    841 			lastARL = inst;
    842 			min_offset = 0;
    843 			continue;
    844 		}
    845 
    846 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
    847 			if (inst->U.I.SrcReg[i].RelAddr &&
    848 			    inst->U.I.SrcReg[i].Index < 0) {
    849 				/* ARL must precede any indirect addressing. */
    850 				if (!lastARL) {
    851 					rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
    852 					return;
    853 				}
    854 
    855 				if (inst->U.I.SrcReg[i].Index < min_offset)
    856 					min_offset = inst->U.I.SrcReg[i].Index;
    857 			}
    858 		}
    859 	}
    860 
    861 	if (lastARL != NULL && min_offset < 0)
    862 		transform_negative_addressing(c, lastARL, inst, min_offset);
    863 }
    864 
    865 struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
    866 	.IsNative = &swizzle_is_native,
    867 	.Split = 0 /* should never be called */
    868 };
    869 
    870 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
    871 {
    872 	int is_r500 = c->Base.is_r500;
    873 	int opt = !c->Base.disable_optimizations;
    874 
    875 	/* Lists of instruction transformations. */
    876 	struct radeon_program_transformation alu_rewrite_r500[] = {
    877 		{ &r300_transform_vertex_alu, 0 },
    878 		{ &r300_transform_trig_scale_vertex, 0 },
    879 		{ 0, 0 }
    880 	};
    881 
    882 	struct radeon_program_transformation alu_rewrite_r300[] = {
    883 		{ &r300_transform_vertex_alu, 0 },
    884 		{ &r300_transform_trig_simple, 0 },
    885 		{ 0, 0 }
    886 	};
    887 
    888 	/* Note: These passes have to be done seperately from ALU rewrite,
    889 	 * otherwise non-native ALU instructions with source conflits
    890 	 * or non-native modifiers will not be treated properly.
    891 	 */
    892 	struct radeon_program_transformation emulate_modifiers[] = {
    893 		{ &transform_nonnative_modifiers, 0 },
    894 		{ 0, 0 }
    895 	};
    896 
    897 	struct radeon_program_transformation resolve_src_conflicts[] = {
    898 		{ &transform_source_conflicts, 0 },
    899 		{ 0, 0 }
    900 	};
    901 
    902 	/* List of compiler passes. */
    903 	struct radeon_compiler_pass vs_list[] = {
    904 		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
    905 		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
    906 		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
    907 		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
    908 		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
    909 		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
    910 		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
    911 		{"deadcode",			1, opt,		rc_dataflow_deadcode,		dataflow_outputs_mark_used},
    912 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
    913 		/* This pass must be done after optimizations. */
    914 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
    915 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
    916 		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
    917 		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
    918 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
    919 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
    920 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
    921 		{NULL, 0, 0, NULL, NULL}
    922 	};
    923 
    924 	c->Base.type = RC_VERTEX_PROGRAM;
    925 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
    926 
    927 	rc_run_compiler(&c->Base, vs_list);
    928 
    929 	c->code->InputsRead = c->Base.Program.InputsRead;
    930 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
    931 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
    932 }
    933