Home | History | Annotate | Download | only in compiler
      1 /*
      2  * Copyright (C) 2008 Nicolai Haehnle.
      3  *
      4  * All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining
      7  * a copy of this software and associated documentation files (the
      8  * "Software"), to deal in the Software without restriction, including
      9  * without limitation the rights to use, copy, modify, merge, publish,
     10  * distribute, sublicense, and/or sell copies of the Software, and to
     11  * permit persons to whom the Software is furnished to do so, subject to
     12  * the following conditions:
     13  *
     14  * The above copyright notice and this permission notice (including the
     15  * next paragraph) shall be included in all copies or substantial
     16  * portions of the Software.
     17  *
     18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
     22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     25  *
     26  */
     27 
     28 /**
     29  * @file
     30  *
     31  * Shareable transformations that transform "special" ALU instructions
     32  * into ALU instructions that are supported by hardware.
     33  *
     34  */
     35 
     36 #include "radeon_program_alu.h"
     37 
     38 #include "radeon_compiler.h"
     39 #include "radeon_compiler_util.h"
     40 
     41 
     42 static struct rc_instruction *emit1(
     43 	struct radeon_compiler * c, struct rc_instruction * after,
     44 	rc_opcode Opcode, struct rc_sub_instruction * base,
     45 	struct rc_dst_register DstReg, struct rc_src_register SrcReg)
     46 {
     47 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
     48 
     49 	if (base) {
     50 		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
     51 	}
     52 
     53 	fpi->U.I.Opcode = Opcode;
     54 	fpi->U.I.DstReg = DstReg;
     55 	fpi->U.I.SrcReg[0] = SrcReg;
     56 	return fpi;
     57 }
     58 
     59 static struct rc_instruction *emit2(
     60 	struct radeon_compiler * c, struct rc_instruction * after,
     61 	rc_opcode Opcode, struct rc_sub_instruction * base,
     62 	struct rc_dst_register DstReg,
     63 	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
     64 {
     65 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
     66 
     67 	if (base) {
     68 		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
     69 	}
     70 
     71 	fpi->U.I.Opcode = Opcode;
     72 	fpi->U.I.DstReg = DstReg;
     73 	fpi->U.I.SrcReg[0] = SrcReg0;
     74 	fpi->U.I.SrcReg[1] = SrcReg1;
     75 	return fpi;
     76 }
     77 
     78 static struct rc_instruction *emit3(
     79 	struct radeon_compiler * c, struct rc_instruction * after,
     80 	rc_opcode Opcode, struct rc_sub_instruction * base,
     81 	struct rc_dst_register DstReg,
     82 	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
     83 	struct rc_src_register SrcReg2)
     84 {
     85 	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
     86 
     87 	if (base) {
     88 		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
     89 	}
     90 
     91 	fpi->U.I.Opcode = Opcode;
     92 	fpi->U.I.DstReg = DstReg;
     93 	fpi->U.I.SrcReg[0] = SrcReg0;
     94 	fpi->U.I.SrcReg[1] = SrcReg1;
     95 	fpi->U.I.SrcReg[2] = SrcReg2;
     96 	return fpi;
     97 }
     98 
     99 static struct rc_dst_register dstregtmpmask(int index, int mask)
    100 {
    101 	struct rc_dst_register dst = {0, 0, 0};
    102 	dst.File = RC_FILE_TEMPORARY;
    103 	dst.Index = index;
    104 	dst.WriteMask = mask;
    105 	return dst;
    106 }
    107 
    108 static const struct rc_src_register builtin_zero = {
    109 	.File = RC_FILE_NONE,
    110 	.Index = 0,
    111 	.Swizzle = RC_SWIZZLE_0000
    112 };
    113 static const struct rc_src_register builtin_one = {
    114 	.File = RC_FILE_NONE,
    115 	.Index = 0,
    116 	.Swizzle = RC_SWIZZLE_1111
    117 };
    118 
    119 static const struct rc_src_register builtin_half = {
    120 	.File = RC_FILE_NONE,
    121 	.Index = 0,
    122 	.Swizzle = RC_SWIZZLE_HHHH
    123 };
    124 
    125 static const struct rc_src_register srcreg_undefined = {
    126 	.File = RC_FILE_NONE,
    127 	.Index = 0,
    128 	.Swizzle = RC_SWIZZLE_XYZW
    129 };
    130 
    131 static struct rc_src_register srcreg(int file, int index)
    132 {
    133 	struct rc_src_register src = srcreg_undefined;
    134 	src.File = file;
    135 	src.Index = index;
    136 	return src;
    137 }
    138 
    139 static struct rc_src_register srcregswz(int file, int index, int swz)
    140 {
    141 	struct rc_src_register src = srcreg_undefined;
    142 	src.File = file;
    143 	src.Index = index;
    144 	src.Swizzle = swz;
    145 	return src;
    146 }
    147 
    148 static struct rc_src_register absolute(struct rc_src_register reg)
    149 {
    150 	struct rc_src_register newreg = reg;
    151 	newreg.Abs = 1;
    152 	newreg.Negate = RC_MASK_NONE;
    153 	return newreg;
    154 }
    155 
    156 static struct rc_src_register negate(struct rc_src_register reg)
    157 {
    158 	struct rc_src_register newreg = reg;
    159 	newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
    160 	return newreg;
    161 }
    162 
    163 static struct rc_src_register swizzle(struct rc_src_register reg,
    164 		rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
    165 {
    166 	struct rc_src_register swizzled = reg;
    167 	swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
    168 	return swizzled;
    169 }
    170 
    171 static struct rc_src_register swizzle_smear(struct rc_src_register reg,
    172 		rc_swizzle x)
    173 {
    174 	return swizzle(reg, x, x, x, x);
    175 }
    176 
    177 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
    178 {
    179 	return swizzle_smear(reg, RC_SWIZZLE_X);
    180 }
    181 
    182 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
    183 {
    184 	return swizzle_smear(reg, RC_SWIZZLE_Y);
    185 }
    186 
    187 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
    188 {
    189 	return swizzle_smear(reg, RC_SWIZZLE_Z);
    190 }
    191 
    192 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
    193 {
    194 	return swizzle_smear(reg, RC_SWIZZLE_W);
    195 }
    196 
    197 static int is_dst_safe_to_reuse(struct rc_instruction *inst)
    198 {
    199 	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
    200 	unsigned i;
    201 
    202 	assert(info->HasDstReg);
    203 
    204 	if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
    205 		return 0;
    206 
    207 	for (i = 0; i < info->NumSrcRegs; i++) {
    208 		if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
    209 		    inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
    210 			return 0;
    211 	}
    212 
    213 	return 1;
    214 }
    215 
    216 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
    217 					       struct rc_instruction *inst)
    218 {
    219 	unsigned tmp;
    220 
    221 	if (is_dst_safe_to_reuse(inst))
    222 		tmp = inst->U.I.DstReg.Index;
    223 	else
    224 		tmp = rc_find_free_temporary(c);
    225 
    226 	return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
    227 }
    228 
    229 static void transform_ABS(struct radeon_compiler* c,
    230 	struct rc_instruction* inst)
    231 {
    232 	struct rc_src_register src = inst->U.I.SrcReg[0];
    233 	src.Abs = 1;
    234 	src.Negate = RC_MASK_NONE;
    235 	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
    236 	rc_remove_instruction(inst);
    237 }
    238 
    239 static void transform_CEIL(struct radeon_compiler* c,
    240 	struct rc_instruction* inst)
    241 {
    242 	/* Assuming:
    243 	 *     ceil(x) = -floor(-x)
    244 	 *
    245 	 * After inlining floor:
    246 	 *     ceil(x) = -(-x-frac(-x))
    247 	 *
    248 	 * After simplification:
    249 	 *     ceil(x) = x+frac(-x)
    250 	 */
    251 
    252 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    253 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
    254 	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
    255 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
    256 	rc_remove_instruction(inst);
    257 }
    258 
    259 static void transform_CLAMP(struct radeon_compiler *c,
    260 	struct rc_instruction *inst)
    261 {
    262 	/* CLAMP dst, src, min, max
    263 	 *    into:
    264 	 * MIN tmp, src, max
    265 	 * MAX dst, tmp, min
    266 	 */
    267 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    268 	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
    269 		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
    270 	emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
    271 		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
    272 	rc_remove_instruction(inst);
    273 }
    274 
    275 static void transform_DP2(struct radeon_compiler* c,
    276 	struct rc_instruction* inst)
    277 {
    278 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
    279 	struct rc_src_register src1 = inst->U.I.SrcReg[1];
    280 	src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
    281 	src0.Swizzle &= ~(63 << (3 * 2));
    282 	src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
    283 	src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
    284 	src1.Swizzle &= ~(63 << (3 * 2));
    285 	src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
    286 	emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
    287 	rc_remove_instruction(inst);
    288 }
    289 
    290 static void transform_DPH(struct radeon_compiler* c,
    291 	struct rc_instruction* inst)
    292 {
    293 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
    294 	src0.Negate &= ~RC_MASK_W;
    295 	src0.Swizzle &= ~(7 << (3 * 3));
    296 	src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
    297 	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
    298 	rc_remove_instruction(inst);
    299 }
    300 
    301 /**
    302  * [1, src0.y*src1.y, src0.z, src1.w]
    303  * So basically MUL with lotsa swizzling.
    304  */
    305 static void transform_DST(struct radeon_compiler* c,
    306 	struct rc_instruction* inst)
    307 {
    308 	emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
    309 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
    310 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
    311 	rc_remove_instruction(inst);
    312 }
    313 
    314 static void transform_FLR(struct radeon_compiler* c,
    315 	struct rc_instruction* inst)
    316 {
    317 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    318 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
    319 	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
    320 		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
    321 	rc_remove_instruction(inst);
    322 }
    323 
    324 static void transform_TRUNC(struct radeon_compiler* c,
    325 	struct rc_instruction* inst)
    326 {
    327 	/* Definition of trunc:
    328 	 *   trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
    329 	 *
    330 	 * The multiplication by sgn(x) can be simplified using CMP:
    331 	 *   y * sgn(x) = (x < 0 ? -y : y)
    332 	 */
    333 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    334 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
    335 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
    336 	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
    337 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
    338 	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
    339 	rc_remove_instruction(inst);
    340 }
    341 
    342 /**
    343  * Definition of LIT (from ARB_fragment_program):
    344  *
    345  *  tmp = VectorLoad(op0);
    346  *  if (tmp.x < 0) tmp.x = 0;
    347  *  if (tmp.y < 0) tmp.y = 0;
    348  *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
    349  *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
    350  *  result.x = 1.0;
    351  *  result.y = tmp.x;
    352  *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
    353  *  result.w = 1.0;
    354  *
    355  * The longest path of computation is the one leading to result.z,
    356  * consisting of 5 operations. This implementation of LIT takes
    357  * 5 slots, if the subsequent optimization passes are clever enough
    358  * to pair instructions correctly.
    359  */
    360 static void transform_LIT(struct radeon_compiler* c,
    361 	struct rc_instruction* inst)
    362 {
    363 	unsigned int constant;
    364 	unsigned int constant_swizzle;
    365 	unsigned int temp;
    366 	struct rc_src_register srctemp;
    367 
    368 	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
    369 
    370 	if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
    371 		struct rc_instruction * inst_mov;
    372 
    373 		inst_mov = emit1(c, inst,
    374 			RC_OPCODE_MOV, 0, inst->U.I.DstReg,
    375 			srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
    376 
    377 		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
    378 		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
    379 		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
    380 	}
    381 
    382 	temp = inst->U.I.DstReg.Index;
    383 	srctemp = srcreg(RC_FILE_TEMPORARY, temp);
    384 
    385 	/* tmp.x = max(0.0, Src.x); */
    386 	/* tmp.y = max(0.0, Src.y); */
    387 	/* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
    388 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
    389 		dstregtmpmask(temp, RC_MASK_XYW),
    390 		inst->U.I.SrcReg[0],
    391 		swizzle(srcreg(RC_FILE_CONSTANT, constant),
    392 			RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
    393 	emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
    394 		dstregtmpmask(temp, RC_MASK_Z),
    395 		swizzle_wwww(srctemp),
    396 		negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
    397 
    398 	/* tmp.w = Pow(tmp.y, tmp.w) */
    399 	emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
    400 		dstregtmpmask(temp, RC_MASK_W),
    401 		swizzle_yyyy(srctemp));
    402 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
    403 		dstregtmpmask(temp, RC_MASK_W),
    404 		swizzle_wwww(srctemp),
    405 		swizzle_zzzz(srctemp));
    406 	emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
    407 		dstregtmpmask(temp, RC_MASK_W),
    408 		swizzle_wwww(srctemp));
    409 
    410 	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
    411 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
    412 		dstregtmpmask(temp, RC_MASK_Z),
    413 		negate(swizzle_xxxx(srctemp)),
    414 		swizzle_wwww(srctemp),
    415 		builtin_zero);
    416 
    417 	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
    418 	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
    419 		dstregtmpmask(temp, RC_MASK_XYW),
    420 		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
    421 
    422 	rc_remove_instruction(inst);
    423 }
    424 
    425 static void transform_LRP(struct radeon_compiler* c,
    426 	struct rc_instruction* inst)
    427 {
    428 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    429 
    430 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
    431 		dst,
    432 		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
    433 	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
    434 		inst->U.I.DstReg,
    435 		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
    436 
    437 	rc_remove_instruction(inst);
    438 }
    439 
    440 static void transform_POW(struct radeon_compiler* c,
    441 	struct rc_instruction* inst)
    442 {
    443 	struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
    444 	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
    445 	tempdst.WriteMask = RC_MASK_W;
    446 	tempsrc.Swizzle = RC_SWIZZLE_WWWW;
    447 
    448 	emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
    449 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
    450 	emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
    451 
    452 	rc_remove_instruction(inst);
    453 }
    454 
    455 /* dst = ROUND(src) :
    456  *   add = src + .5
    457  *   frac = FRC(add)
    458  *   dst = add - frac
    459  *
    460  * According to the GLSL spec, the implementor can decide which way to round
    461  * when the fraction is .5.  We round down for .5.
    462  *
    463  */
    464 static void transform_ROUND(struct radeon_compiler* c,
    465 	struct rc_instruction* inst)
    466 {
    467 	unsigned int mask = inst->U.I.DstReg.WriteMask;
    468 	unsigned int frac_index, add_index;
    469 	struct rc_dst_register frac_dst, add_dst;
    470 	struct rc_src_register frac_src, add_src;
    471 
    472 	/* add = src + .5 */
    473 	add_index = rc_find_free_temporary(c);
    474 	add_dst = dstregtmpmask(add_index, mask);
    475 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
    476 								builtin_half);
    477 	add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
    478 
    479 
    480 	/* frac = FRC(add) */
    481 	frac_index = rc_find_free_temporary(c);
    482 	frac_dst = dstregtmpmask(frac_index, mask);
    483 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
    484 	frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
    485 
    486 	/* dst = add - frac */
    487 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
    488 						add_src, negate(frac_src));
    489 	rc_remove_instruction(inst);
    490 }
    491 
    492 static void transform_RSQ(struct radeon_compiler* c,
    493 	struct rc_instruction* inst)
    494 {
    495 	inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
    496 }
    497 
    498 static void transform_SEQ(struct radeon_compiler* c,
    499 	struct rc_instruction* inst)
    500 {
    501 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    502 
    503 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
    504 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
    505 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
    506 
    507 	rc_remove_instruction(inst);
    508 }
    509 
    510 static void transform_SFL(struct radeon_compiler* c,
    511 	struct rc_instruction* inst)
    512 {
    513 	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
    514 	rc_remove_instruction(inst);
    515 }
    516 
    517 static void transform_SGE(struct radeon_compiler* c,
    518 	struct rc_instruction* inst)
    519 {
    520 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    521 
    522 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
    523 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
    524 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
    525 
    526 	rc_remove_instruction(inst);
    527 }
    528 
    529 static void transform_SGT(struct radeon_compiler* c,
    530 	struct rc_instruction* inst)
    531 {
    532 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    533 
    534 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
    535 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
    536 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
    537 
    538 	rc_remove_instruction(inst);
    539 }
    540 
    541 static void transform_SLE(struct radeon_compiler* c,
    542 	struct rc_instruction* inst)
    543 {
    544 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    545 
    546 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
    547 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
    548 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
    549 
    550 	rc_remove_instruction(inst);
    551 }
    552 
    553 static void transform_SLT(struct radeon_compiler* c,
    554 	struct rc_instruction* inst)
    555 {
    556 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    557 
    558 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
    559 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
    560 		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
    561 
    562 	rc_remove_instruction(inst);
    563 }
    564 
    565 static void transform_SNE(struct radeon_compiler* c,
    566 	struct rc_instruction* inst)
    567 {
    568 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    569 
    570 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
    571 	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
    572 		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
    573 
    574 	rc_remove_instruction(inst);
    575 }
    576 
    577 static void transform_SSG(struct radeon_compiler* c,
    578 	struct rc_instruction* inst)
    579 {
    580 	/* result = sign(x)
    581 	 *
    582 	 *   CMP tmp0, -x, 1, 0
    583 	 *   CMP tmp1, x, 1, 0
    584 	 *   ADD result, tmp0, -tmp1;
    585 	 */
    586 	struct rc_dst_register dst0;
    587 	unsigned tmp1;
    588 
    589 	/* 0 < x */
    590 	dst0 = try_to_reuse_dst(c, inst);
    591 	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
    592 	      dst0,
    593 	      negate(inst->U.I.SrcReg[0]),
    594 	      builtin_one,
    595 	      builtin_zero);
    596 
    597 	/* x < 0 */
    598 	tmp1 = rc_find_free_temporary(c);
    599 	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
    600 	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
    601 	      inst->U.I.SrcReg[0],
    602 	      builtin_one,
    603 	      builtin_zero);
    604 
    605 	/* Either both are zero, or one of them is one and the other is zero. */
    606 	/* result = tmp0 - tmp1 */
    607 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
    608 	      inst->U.I.DstReg,
    609 	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
    610 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
    611 
    612 	rc_remove_instruction(inst);
    613 }
    614 
    615 static void transform_SUB(struct radeon_compiler* c,
    616 	struct rc_instruction* inst)
    617 {
    618 	inst->U.I.Opcode = RC_OPCODE_ADD;
    619 	inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
    620 }
    621 
    622 static void transform_SWZ(struct radeon_compiler* c,
    623 	struct rc_instruction* inst)
    624 {
    625 	inst->U.I.Opcode = RC_OPCODE_MOV;
    626 }
    627 
    628 static void transform_XPD(struct radeon_compiler* c,
    629 	struct rc_instruction* inst)
    630 {
    631 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    632 
    633 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
    634 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
    635 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
    636 	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
    637 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
    638 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
    639 		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
    640 
    641 	rc_remove_instruction(inst);
    642 }
    643 
    644 
    645 /**
    646  * Can be used as a transformation for @ref radeonClauseLocalTransform,
    647  * no userData necessary.
    648  *
    649  * Eliminates the following ALU instructions:
    650  *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
    651  * using:
    652  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
    653  *
    654  * Transforms RSQ to Radeon's native RSQ by explicitly setting
    655  * absolute value.
    656  *
    657  * @note should be applicable to R300 and R500 fragment programs.
    658  */
    659 int radeonTransformALU(
    660 	struct radeon_compiler * c,
    661 	struct rc_instruction* inst,
    662 	void* unused)
    663 {
    664 	switch(inst->U.I.Opcode) {
    665 	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
    666 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
    667 	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
    668 	case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
    669 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
    670 	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
    671 	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
    672 	case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
    673 	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
    674 	case RC_OPCODE_POW: transform_POW(c, inst); return 1;
    675 	case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
    676 	case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
    677 	case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
    678 	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
    679 	case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
    680 	case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
    681 	case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
    682 	case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
    683 	case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
    684 	case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
    685 	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
    686 	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
    687 	case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
    688 	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
    689 	default:
    690 		return 0;
    691 	}
    692 }
    693 
    694 
    695 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
    696 	struct rc_instruction* inst)
    697 {
    698 	/* Note: r500 can take absolute values, but r300 cannot. */
    699 	inst->U.I.Opcode = RC_OPCODE_MAX;
    700 	inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
    701 	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
    702 }
    703 
    704 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
    705 	struct rc_instruction* inst)
    706 {
    707 	/* There is no decent CMP available, so let's rig one up.
    708 	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
    709 	 * The following sequence consumes zero to two temps and two extra slots
    710 	 * (the second temp and the second slot is consumed by transform_LRP),
    711 	 * but should be equivalent:
    712 	 *
    713 	 * SLT tmp0, src0, 0.0
    714 	 * LRP dst, tmp0, src1, src2
    715 	 *
    716 	 * Yes, I know, I'm a mad scientist. ~ C. & M. */
    717 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    718 
    719 	/* SLT tmp0, src0, 0.0 */
    720 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
    721 		dst,
    722 		inst->U.I.SrcReg[0], builtin_zero);
    723 
    724 	/* LRP dst, tmp0, src1, src2 */
    725 	transform_LRP(c,
    726 		emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
    727 		      inst->U.I.DstReg,
    728 		      srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
    729 
    730 	rc_remove_instruction(inst);
    731 }
    732 
    733 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
    734 	struct rc_instruction* inst)
    735 {
    736 	struct rc_instruction *next_inst = inst->Next;
    737 	transform_DP2(c, inst);
    738 	next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
    739 }
    740 
    741 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
    742 	struct rc_instruction* inst)
    743 {
    744 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
    745 	struct rc_src_register src1 = inst->U.I.SrcReg[1];
    746 	src0.Negate &= ~RC_MASK_W;
    747 	src0.Swizzle &= ~(7 << (3 * 3));
    748 	src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
    749 	src1.Negate &= ~RC_MASK_W;
    750 	src1.Swizzle &= ~(7 << (3 * 3));
    751 	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
    752 	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
    753 	rc_remove_instruction(inst);
    754 }
    755 
    756 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
    757 	struct rc_instruction* inst)
    758 {
    759 	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
    760 	unsigned constant_swizzle;
    761 	int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
    762 							 0.0000000000000000001,
    763 							 &constant_swizzle);
    764 
    765 	/* MOV dst, src */
    766 	dst.WriteMask = RC_MASK_XYZW;
    767 	emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
    768 		dst,
    769 		inst->U.I.SrcReg[0]);
    770 
    771 	/* MAX dst.y, src, 0.00...001 */
    772 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
    773 		dstregtmpmask(dst.Index, RC_MASK_Y),
    774 		srcreg(RC_FILE_TEMPORARY, dst.Index),
    775 		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
    776 
    777 	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
    778 }
    779 
    780 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
    781 	struct rc_instruction *inst)
    782 {
    783 	/* x = y  <==>  x >= y && y >= x */
    784 	int tmp = rc_find_free_temporary(c);
    785 
    786 	/* x <= y */
    787 	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
    788 	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
    789 	      inst->U.I.SrcReg[0],
    790 	      inst->U.I.SrcReg[1]);
    791 
    792 	/* y <= x */
    793 	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
    794 	      inst->U.I.DstReg,
    795 	      inst->U.I.SrcReg[1],
    796 	      inst->U.I.SrcReg[0]);
    797 
    798 	/* x && y  =  x * y */
    799 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
    800 	      inst->U.I.DstReg,
    801 	      srcreg(RC_FILE_TEMPORARY, tmp),
    802 	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
    803 
    804 	rc_remove_instruction(inst);
    805 }
    806 
    807 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
    808 	struct rc_instruction *inst)
    809 {
    810 	/* x != y  <==>  x < y || y < x */
    811 	int tmp = rc_find_free_temporary(c);
    812 
    813 	/* x < y */
    814 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
    815 	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
    816 	      inst->U.I.SrcReg[0],
    817 	      inst->U.I.SrcReg[1]);
    818 
    819 	/* y < x */
    820 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
    821 	      inst->U.I.DstReg,
    822 	      inst->U.I.SrcReg[1],
    823 	      inst->U.I.SrcReg[0]);
    824 
    825 	/* x || y  =  max(x, y) */
    826 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
    827 	      inst->U.I.DstReg,
    828 	      srcreg(RC_FILE_TEMPORARY, tmp),
    829 	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
    830 
    831 	rc_remove_instruction(inst);
    832 }
    833 
    834 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
    835 	struct rc_instruction* inst)
    836 {
    837 	/* x > y  <==>  -x < -y */
    838 	inst->U.I.Opcode = RC_OPCODE_SLT;
    839 	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
    840 	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
    841 }
    842 
    843 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
    844 	struct rc_instruction* inst)
    845 {
    846 	/* x <= y  <==>  -x >= -y */
    847 	inst->U.I.Opcode = RC_OPCODE_SGE;
    848 	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
    849 	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
    850 }
    851 
    852 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
    853 	struct rc_instruction* inst)
    854 {
    855 	/* result = sign(x)
    856 	 *
    857 	 *   SLT tmp0, 0, x;
    858 	 *   SLT tmp1, x, 0;
    859 	 *   ADD result, tmp0, -tmp1;
    860 	 */
    861 	struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
    862 	unsigned tmp1;
    863 
    864 	/* 0 < x */
    865 	dst0 = try_to_reuse_dst(c, inst);
    866 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
    867 	      dst0,
    868 	      builtin_zero,
    869 	      inst->U.I.SrcReg[0]);
    870 
    871 	/* x < 0 */
    872 	tmp1 = rc_find_free_temporary(c);
    873 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
    874 	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
    875 	      inst->U.I.SrcReg[0],
    876 	      builtin_zero);
    877 
    878 	/* Either both are zero, or one of them is one and the other is zero. */
    879 	/* result = tmp0 - tmp1 */
    880 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
    881 	      inst->U.I.DstReg,
    882 	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
    883 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
    884 
    885 	rc_remove_instruction(inst);
    886 }
    887 
    888 static void transform_vertex_TRUNC(struct radeon_compiler* c,
    889 	struct rc_instruction* inst)
    890 {
    891 	struct rc_instruction *next = inst->Next;
    892 
    893 	/* next->Prev is removed after each transformation and replaced
    894 	 * by a new instruction. */
    895 	transform_TRUNC(c, next->Prev);
    896 	transform_r300_vertex_CMP(c, next->Prev);
    897 }
    898 
    899 /**
    900  * For use with rc_local_transform, this transforms non-native ALU
    901  * instructions of the r300 up to r500 vertex engine.
    902  */
    903 int r300_transform_vertex_alu(
    904 	struct radeon_compiler * c,
    905 	struct rc_instruction* inst,
    906 	void* unused)
    907 {
    908 	switch(inst->U.I.Opcode) {
    909 	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
    910 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
    911 	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
    912 	case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
    913 	case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
    914 	case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
    915 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
    916 	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
    917 	case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
    918 	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
    919 	case RC_OPCODE_SEQ:
    920 		if (!c->is_r500) {
    921 			transform_r300_vertex_SEQ(c, inst);
    922 			return 1;
    923 		}
    924 		return 0;
    925 	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
    926 	case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
    927 	case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
    928 	case RC_OPCODE_SNE:
    929 		if (!c->is_r500) {
    930 			transform_r300_vertex_SNE(c, inst);
    931 			return 1;
    932 		}
    933 		return 0;
    934 	case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
    935 	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
    936 	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
    937 	case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
    938 	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
    939 	default:
    940 		return 0;
    941 	}
    942 }
    943 
    944 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
    945 {
    946 	static const float SinCosConsts[2][4] = {
    947 		{
    948 			1.273239545,		/* 4/PI */
    949 			-0.405284735,		/* -4/(PI*PI) */
    950 			3.141592654,		/* PI */
    951 			0.2225			/* weight */
    952 		},
    953 		{
    954 			0.75,
    955 			0.5,
    956 			0.159154943,		/* 1/(2*PI) */
    957 			6.283185307		/* 2*PI */
    958 		}
    959 	};
    960 	int i;
    961 
    962 	for(i = 0; i < 2; ++i)
    963 		constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
    964 }
    965 
    966 /**
    967  * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
    968  *
    969  * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
    970  * MAD tmp.x, tmp.y, |src|, tmp.x
    971  * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
    972  * MAD dest, tmp.y, weight, tmp.x
    973  */
    974 static void sin_approx(
    975 	struct radeon_compiler* c, struct rc_instruction * inst,
    976 	struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
    977 {
    978 	unsigned int tempreg = rc_find_free_temporary(c);
    979 
    980 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
    981 		swizzle_xxxx(src),
    982 		srcreg(RC_FILE_CONSTANT, constants[0]));
    983 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
    984 		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
    985 		absolute(swizzle_xxxx(src)),
    986 		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
    987 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
    988 		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
    989 		absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
    990 		negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
    991 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
    992 		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
    993 		swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
    994 		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
    995 }
    996 
    997 /**
    998  * Translate the trigonometric functions COS, SIN, and SCS
    999  * using only the basic instructions
   1000  *  MOV, ADD, MUL, MAD, FRC
   1001  */
   1002 int r300_transform_trig_simple(struct radeon_compiler* c,
   1003 	struct rc_instruction* inst,
   1004 	void* unused)
   1005 {
   1006 	unsigned int constants[2];
   1007 	unsigned int tempreg;
   1008 
   1009 	if (inst->U.I.Opcode != RC_OPCODE_COS &&
   1010 	    inst->U.I.Opcode != RC_OPCODE_SIN &&
   1011 	    inst->U.I.Opcode != RC_OPCODE_SCS)
   1012 		return 0;
   1013 
   1014 	tempreg = rc_find_free_temporary(c);
   1015 
   1016 	sincos_constants(c, constants);
   1017 
   1018 	if (inst->U.I.Opcode == RC_OPCODE_COS) {
   1019 		/* MAD tmp.x, src, 1/(2*PI), 0.75 */
   1020 		/* FRC tmp.x, tmp.x */
   1021 		/* MAD tmp.z, tmp.x, 2*PI, -PI */
   1022 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
   1023 			swizzle_xxxx(inst->U.I.SrcReg[0]),
   1024 			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
   1025 			swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
   1026 		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
   1027 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
   1028 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
   1029 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
   1030 			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
   1031 			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
   1032 
   1033 		sin_approx(c, inst, inst->U.I.DstReg,
   1034 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
   1035 			constants);
   1036 	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
   1037 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
   1038 			swizzle_xxxx(inst->U.I.SrcReg[0]),
   1039 			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
   1040 			swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
   1041 		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
   1042 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
   1043 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
   1044 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
   1045 			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
   1046 			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
   1047 
   1048 		sin_approx(c, inst, inst->U.I.DstReg,
   1049 			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
   1050 			constants);
   1051 	} else {
   1052 		struct rc_dst_register dst;
   1053 
   1054 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
   1055 			swizzle_xxxx(inst->U.I.SrcReg[0]),
   1056 			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
   1057 			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
   1058 		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
   1059 			srcreg(RC_FILE_TEMPORARY, tempreg));
   1060 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
   1061 			srcreg(RC_FILE_TEMPORARY, tempreg),
   1062 			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
   1063 			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
   1064 
   1065 		dst = inst->U.I.DstReg;
   1066 
   1067 		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
   1068 		sin_approx(c, inst, dst,
   1069 			swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
   1070 			constants);
   1071 
   1072 		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
   1073 		sin_approx(c, inst, dst,
   1074 			swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
   1075 			constants);
   1076 	}
   1077 
   1078 	rc_remove_instruction(inst);
   1079 
   1080 	return 1;
   1081 }
   1082 
   1083 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
   1084 	struct rc_instruction *inst,
   1085 	unsigned srctmp)
   1086 {
   1087 	if (inst->U.I.Opcode == RC_OPCODE_COS) {
   1088 		emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
   1089 			srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
   1090 	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
   1091 		emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
   1092 			inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
   1093 	} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
   1094 		struct rc_dst_register moddst = inst->U.I.DstReg;
   1095 
   1096 		if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
   1097 			moddst.WriteMask = RC_MASK_X;
   1098 			emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
   1099 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
   1100 		}
   1101 		if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
   1102 			moddst.WriteMask = RC_MASK_Y;
   1103 			emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
   1104 				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
   1105 		}
   1106 	}
   1107 
   1108 	rc_remove_instruction(inst);
   1109 }
   1110 
   1111 
   1112 /**
   1113  * Transform the trigonometric functions COS, SIN, and SCS
   1114  * to include pre-scaling by 1/(2*PI) and taking the fractional
   1115  * part, so that the input to COS and SIN is always in the range [0,1).
   1116  * SCS is replaced by one COS and one SIN instruction.
   1117  *
   1118  * @warning This transformation implicitly changes the semantics of SIN and COS!
   1119  */
   1120 int radeonTransformTrigScale(struct radeon_compiler* c,
   1121 	struct rc_instruction* inst,
   1122 	void* unused)
   1123 {
   1124 	static const float RCP_2PI = 0.15915494309189535;
   1125 	unsigned int temp;
   1126 	unsigned int constant;
   1127 	unsigned int constant_swizzle;
   1128 
   1129 	if (inst->U.I.Opcode != RC_OPCODE_COS &&
   1130 	    inst->U.I.Opcode != RC_OPCODE_SIN &&
   1131 	    inst->U.I.Opcode != RC_OPCODE_SCS)
   1132 		return 0;
   1133 
   1134 	temp = rc_find_free_temporary(c);
   1135 	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
   1136 
   1137 	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
   1138 		swizzle_xxxx(inst->U.I.SrcReg[0]),
   1139 		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
   1140 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
   1141 		srcreg(RC_FILE_TEMPORARY, temp));
   1142 
   1143 	r300_transform_SIN_COS_SCS(c, inst, temp);
   1144 	return 1;
   1145 }
   1146 
   1147 /**
   1148  * Transform the trigonometric functions COS, SIN, and SCS
   1149  * so that the input to COS and SIN is always in the range [-PI, PI].
   1150  * SCS is replaced by one COS and one SIN instruction.
   1151  */
   1152 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
   1153 	struct rc_instruction *inst,
   1154 	void *unused)
   1155 {
   1156 	static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
   1157 	unsigned int temp;
   1158 	unsigned int constant;
   1159 
   1160 	if (inst->U.I.Opcode != RC_OPCODE_COS &&
   1161 	    inst->U.I.Opcode != RC_OPCODE_SIN &&
   1162 	    inst->U.I.Opcode != RC_OPCODE_SCS)
   1163 		return 0;
   1164 
   1165 	/* Repeat x in the range [-PI, PI]:
   1166 	 *
   1167 	 *   repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
   1168 	 */
   1169 
   1170 	temp = rc_find_free_temporary(c);
   1171 	constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
   1172 
   1173 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
   1174 		swizzle_xxxx(inst->U.I.SrcReg[0]),
   1175 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
   1176 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
   1177 	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
   1178 		srcreg(RC_FILE_TEMPORARY, temp));
   1179 	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
   1180 		srcreg(RC_FILE_TEMPORARY, temp),
   1181 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
   1182 		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
   1183 
   1184 	r300_transform_SIN_COS_SCS(c, inst, temp);
   1185 	return 1;
   1186 }
   1187 
   1188 /**
   1189  * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
   1190  * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
   1191  * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
   1192  *
   1193  * @warning This explicitly changes the form of DDX and DDY!
   1194  */
   1195 
   1196 int radeonTransformDeriv(struct radeon_compiler* c,
   1197 	struct rc_instruction* inst,
   1198 	void* unused)
   1199 {
   1200 	if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
   1201 		return 0;
   1202 
   1203 	inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
   1204 	inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
   1205 
   1206 	return 1;
   1207 }
   1208 
   1209 /**
   1210  * IF Temp[0].x -> IF Temp[0].x
   1211  * ...          -> ...
   1212  * KILL         -> KIL -abs(Temp[0].x)
   1213  * ...          -> ...
   1214  * ENDIF        -> ENDIF
   1215  *
   1216  * === OR ===
   1217  *
   1218  * IF Temp[0].x -\
   1219  * KILL         - > KIL -abs(Temp[0].x)
   1220  * ENDIF        -/
   1221  *
   1222  * === OR ===
   1223  *
   1224  * IF Temp[0].x -> IF Temp[0].x
   1225  * ...          -> ...
   1226  * ELSE         -> ELSE
   1227  * ...	        -> ...
   1228  * KILL	        -> KIL -abs(Temp[0].x)
   1229  * ...          -> ...
   1230  * ENDIF        -> ENDIF
   1231  *
   1232  * === OR ===
   1233  *
   1234  * KILL         -> KIL -none.1111
   1235  *
   1236  * This needs to be done in its own pass, because it might modify the
   1237  * instructions before and after KILL.
   1238  */
   1239 void rc_transform_KILL(struct radeon_compiler * c, void *user)
   1240 {
   1241 	struct rc_instruction * inst;
   1242 	for (inst = c->Program.Instructions.Next;
   1243 			inst != &c->Program.Instructions; inst = inst->Next) {
   1244 		struct rc_instruction * if_inst;
   1245 		unsigned in_if = 0;
   1246 
   1247 		if (inst->U.I.Opcode != RC_OPCODE_KILP)
   1248 			continue;
   1249 
   1250 		for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
   1251 						if_inst = if_inst->Prev) {
   1252 
   1253 			if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
   1254 				in_if = 1;
   1255 				break;
   1256 			}
   1257 		}
   1258 
   1259 		inst->U.I.Opcode = RC_OPCODE_KIL;
   1260 
   1261 		if (!in_if) {
   1262 			inst->U.I.SrcReg[0] = negate(builtin_one);
   1263 		} else {
   1264 			/* This should work even if the KILP is inside the ELSE
   1265 			 * block, because -0.0 is considered negative. */
   1266 			inst->U.I.SrcReg[0] =
   1267 				negate(absolute(if_inst->U.I.SrcReg[0]));
   1268 
   1269 			if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
   1270 				&& inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
   1271 
   1272 				/* Optimize the special case:
   1273 				 * IF Temp[0].x
   1274 				 * KILP
   1275 				 * ENDIF
   1276 				 */
   1277 
   1278 				/* Remove IF */
   1279 				rc_remove_instruction(inst->Prev);
   1280 				/* Remove ENDIF */
   1281 				rc_remove_instruction(inst->Next);
   1282 			}
   1283 		}
   1284 	}
   1285 }
   1286 
   1287 int rc_force_output_alpha_to_one(struct radeon_compiler *c,
   1288 				 struct rc_instruction *inst, void *data)
   1289 {
   1290 	struct r300_fragment_program_compiler *fragc = (struct r300_fragment_program_compiler*)c;
   1291 	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
   1292 	unsigned tmp;
   1293 
   1294 	if (!info->HasDstReg || inst->U.I.DstReg.File != RC_FILE_OUTPUT ||
   1295 	    inst->U.I.DstReg.Index == fragc->OutputDepth)
   1296 		return 1;
   1297 
   1298 	tmp = rc_find_free_temporary(c);
   1299 
   1300 	/* Insert MOV after inst, set alpha to 1. */
   1301 	emit1(c, inst, RC_OPCODE_MOV, 0, inst->U.I.DstReg,
   1302 	      srcregswz(RC_FILE_TEMPORARY, tmp, RC_SWIZZLE_XYZ1));
   1303 
   1304 	/* Re-route the destination of inst to the source of mov. */
   1305 	inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
   1306 	inst->U.I.DstReg.Index = tmp;
   1307 
   1308 	/* Move the saturate output modifier to the MOV instruction
   1309 	 * (for better copy propagation). */
   1310 	inst->Next->U.I.SaturateMode = inst->U.I.SaturateMode;
   1311 	inst->U.I.SaturateMode = RC_SATURATE_NONE;
   1312 	return 1;
   1313 }
   1314