1 /* 2 * Copyright (C) 2008 Nicolai Haehnle. 3 * 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining 7 * a copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sublicense, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial 16 * portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE 22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 */ 27 28 /** 29 * @file 30 * 31 * Shareable transformations that transform "special" ALU instructions 32 * into ALU instructions that are supported by hardware. 33 * 34 */ 35 36 #include "radeon_program_alu.h" 37 38 #include "radeon_compiler.h" 39 #include "radeon_compiler_util.h" 40 41 42 static struct rc_instruction *emit1( 43 struct radeon_compiler * c, struct rc_instruction * after, 44 rc_opcode Opcode, struct rc_sub_instruction * base, 45 struct rc_dst_register DstReg, struct rc_src_register SrcReg) 46 { 47 struct rc_instruction *fpi = rc_insert_new_instruction(c, after); 48 49 if (base) { 50 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction)); 51 } 52 53 fpi->U.I.Opcode = Opcode; 54 fpi->U.I.DstReg = DstReg; 55 fpi->U.I.SrcReg[0] = SrcReg; 56 return fpi; 57 } 58 59 static struct rc_instruction *emit2( 60 struct radeon_compiler * c, struct rc_instruction * after, 61 rc_opcode Opcode, struct rc_sub_instruction * base, 62 struct rc_dst_register DstReg, 63 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1) 64 { 65 struct rc_instruction *fpi = rc_insert_new_instruction(c, after); 66 67 if (base) { 68 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction)); 69 } 70 71 fpi->U.I.Opcode = Opcode; 72 fpi->U.I.DstReg = DstReg; 73 fpi->U.I.SrcReg[0] = SrcReg0; 74 fpi->U.I.SrcReg[1] = SrcReg1; 75 return fpi; 76 } 77 78 static struct rc_instruction *emit3( 79 struct radeon_compiler * c, struct rc_instruction * after, 80 rc_opcode Opcode, struct rc_sub_instruction * base, 81 struct rc_dst_register DstReg, 82 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1, 83 struct rc_src_register SrcReg2) 84 { 85 struct rc_instruction *fpi = rc_insert_new_instruction(c, after); 86 87 if (base) { 88 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction)); 89 } 90 91 fpi->U.I.Opcode = Opcode; 92 fpi->U.I.DstReg = DstReg; 93 fpi->U.I.SrcReg[0] = SrcReg0; 94 fpi->U.I.SrcReg[1] = SrcReg1; 95 fpi->U.I.SrcReg[2] = SrcReg2; 96 return fpi; 97 } 98 99 static struct rc_dst_register dstregtmpmask(int index, int mask) 100 { 101 struct rc_dst_register dst = {0, 0, 0}; 102 dst.File = RC_FILE_TEMPORARY; 103 dst.Index = index; 104 dst.WriteMask = mask; 105 return dst; 106 } 107 108 static const struct rc_src_register builtin_zero = { 109 .File = RC_FILE_NONE, 110 .Index = 0, 111 .Swizzle = RC_SWIZZLE_0000 112 }; 113 static const struct rc_src_register builtin_one = { 114 .File = RC_FILE_NONE, 115 .Index = 0, 116 .Swizzle = RC_SWIZZLE_1111 117 }; 118 119 static const struct rc_src_register builtin_half = { 120 .File = RC_FILE_NONE, 121 .Index = 0, 122 .Swizzle = RC_SWIZZLE_HHHH 123 }; 124 125 static const struct rc_src_register srcreg_undefined = { 126 .File = RC_FILE_NONE, 127 .Index = 0, 128 .Swizzle = RC_SWIZZLE_XYZW 129 }; 130 131 static struct rc_src_register srcreg(int file, int index) 132 { 133 struct rc_src_register src = srcreg_undefined; 134 src.File = file; 135 src.Index = index; 136 return src; 137 } 138 139 static struct rc_src_register srcregswz(int file, int index, int swz) 140 { 141 struct rc_src_register src = srcreg_undefined; 142 src.File = file; 143 src.Index = index; 144 src.Swizzle = swz; 145 return src; 146 } 147 148 static struct rc_src_register absolute(struct rc_src_register reg) 149 { 150 struct rc_src_register newreg = reg; 151 newreg.Abs = 1; 152 newreg.Negate = RC_MASK_NONE; 153 return newreg; 154 } 155 156 static struct rc_src_register negate(struct rc_src_register reg) 157 { 158 struct rc_src_register newreg = reg; 159 newreg.Negate = newreg.Negate ^ RC_MASK_XYZW; 160 return newreg; 161 } 162 163 static struct rc_src_register swizzle(struct rc_src_register reg, 164 rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w) 165 { 166 struct rc_src_register swizzled = reg; 167 swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w); 168 return swizzled; 169 } 170 171 static struct rc_src_register swizzle_smear(struct rc_src_register reg, 172 rc_swizzle x) 173 { 174 return swizzle(reg, x, x, x, x); 175 } 176 177 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg) 178 { 179 return swizzle_smear(reg, RC_SWIZZLE_X); 180 } 181 182 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg) 183 { 184 return swizzle_smear(reg, RC_SWIZZLE_Y); 185 } 186 187 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg) 188 { 189 return swizzle_smear(reg, RC_SWIZZLE_Z); 190 } 191 192 static struct rc_src_register swizzle_wwww(struct rc_src_register reg) 193 { 194 return swizzle_smear(reg, RC_SWIZZLE_W); 195 } 196 197 static int is_dst_safe_to_reuse(struct rc_instruction *inst) 198 { 199 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode); 200 unsigned i; 201 202 assert(info->HasDstReg); 203 204 if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY) 205 return 0; 206 207 for (i = 0; i < info->NumSrcRegs; i++) { 208 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY && 209 inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index) 210 return 0; 211 } 212 213 return 1; 214 } 215 216 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c, 217 struct rc_instruction *inst) 218 { 219 unsigned tmp; 220 221 if (is_dst_safe_to_reuse(inst)) 222 tmp = inst->U.I.DstReg.Index; 223 else 224 tmp = rc_find_free_temporary(c); 225 226 return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask); 227 } 228 229 static void transform_ABS(struct radeon_compiler* c, 230 struct rc_instruction* inst) 231 { 232 struct rc_src_register src = inst->U.I.SrcReg[0]; 233 src.Abs = 1; 234 src.Negate = RC_MASK_NONE; 235 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src); 236 rc_remove_instruction(inst); 237 } 238 239 static void transform_CEIL(struct radeon_compiler* c, 240 struct rc_instruction* inst) 241 { 242 /* Assuming: 243 * ceil(x) = -floor(-x) 244 * 245 * After inlining floor: 246 * ceil(x) = -(-x-frac(-x)) 247 * 248 * After simplification: 249 * ceil(x) = x+frac(-x) 250 */ 251 252 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 253 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0])); 254 emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg, 255 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index)); 256 rc_remove_instruction(inst); 257 } 258 259 static void transform_CLAMP(struct radeon_compiler *c, 260 struct rc_instruction *inst) 261 { 262 /* CLAMP dst, src, min, max 263 * into: 264 * MIN tmp, src, max 265 * MAX dst, tmp, min 266 */ 267 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 268 emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst, 269 inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]); 270 emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg, 271 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]); 272 rc_remove_instruction(inst); 273 } 274 275 static void transform_DP2(struct radeon_compiler* c, 276 struct rc_instruction* inst) 277 { 278 struct rc_src_register src0 = inst->U.I.SrcReg[0]; 279 struct rc_src_register src1 = inst->U.I.SrcReg[1]; 280 src0.Negate &= ~(RC_MASK_Z | RC_MASK_W); 281 src0.Swizzle &= ~(63 << (3 * 2)); 282 src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3)); 283 src1.Negate &= ~(RC_MASK_Z | RC_MASK_W); 284 src1.Swizzle &= ~(63 << (3 * 2)); 285 src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3)); 286 emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1); 287 rc_remove_instruction(inst); 288 } 289 290 static void transform_DPH(struct radeon_compiler* c, 291 struct rc_instruction* inst) 292 { 293 struct rc_src_register src0 = inst->U.I.SrcReg[0]; 294 src0.Negate &= ~RC_MASK_W; 295 src0.Swizzle &= ~(7 << (3 * 3)); 296 src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3); 297 emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]); 298 rc_remove_instruction(inst); 299 } 300 301 /** 302 * [1, src0.y*src1.y, src0.z, src1.w] 303 * So basically MUL with lotsa swizzling. 304 */ 305 static void transform_DST(struct radeon_compiler* c, 306 struct rc_instruction* inst) 307 { 308 emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg, 309 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE), 310 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W)); 311 rc_remove_instruction(inst); 312 } 313 314 static void transform_FLR(struct radeon_compiler* c, 315 struct rc_instruction* inst) 316 { 317 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 318 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]); 319 emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg, 320 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index))); 321 rc_remove_instruction(inst); 322 } 323 324 static void transform_TRUNC(struct radeon_compiler* c, 325 struct rc_instruction* inst) 326 { 327 /* Definition of trunc: 328 * trunc(x) = (abs(x) - fract(abs(x))) * sgn(x) 329 * 330 * The multiplication by sgn(x) can be simplified using CMP: 331 * y * sgn(x) = (x < 0 ? -y : y) 332 */ 333 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 334 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0])); 335 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]), 336 negate(srcreg(RC_FILE_TEMPORARY, dst.Index))); 337 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0], 338 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index)); 339 rc_remove_instruction(inst); 340 } 341 342 /** 343 * Definition of LIT (from ARB_fragment_program): 344 * 345 * tmp = VectorLoad(op0); 346 * if (tmp.x < 0) tmp.x = 0; 347 * if (tmp.y < 0) tmp.y = 0; 348 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon); 349 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon; 350 * result.x = 1.0; 351 * result.y = tmp.x; 352 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0; 353 * result.w = 1.0; 354 * 355 * The longest path of computation is the one leading to result.z, 356 * consisting of 5 operations. This implementation of LIT takes 357 * 5 slots, if the subsequent optimization passes are clever enough 358 * to pair instructions correctly. 359 */ 360 static void transform_LIT(struct radeon_compiler* c, 361 struct rc_instruction* inst) 362 { 363 unsigned int constant; 364 unsigned int constant_swizzle; 365 unsigned int temp; 366 struct rc_src_register srctemp; 367 368 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle); 369 370 if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) { 371 struct rc_instruction * inst_mov; 372 373 inst_mov = emit1(c, inst, 374 RC_OPCODE_MOV, 0, inst->U.I.DstReg, 375 srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c))); 376 377 inst->U.I.DstReg.File = RC_FILE_TEMPORARY; 378 inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index; 379 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW; 380 } 381 382 temp = inst->U.I.DstReg.Index; 383 srctemp = srcreg(RC_FILE_TEMPORARY, temp); 384 385 /* tmp.x = max(0.0, Src.x); */ 386 /* tmp.y = max(0.0, Src.y); */ 387 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */ 388 emit2(c, inst->Prev, RC_OPCODE_MAX, 0, 389 dstregtmpmask(temp, RC_MASK_XYW), 390 inst->U.I.SrcReg[0], 391 swizzle(srcreg(RC_FILE_CONSTANT, constant), 392 RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3)); 393 emit2(c, inst->Prev, RC_OPCODE_MIN, 0, 394 dstregtmpmask(temp, RC_MASK_Z), 395 swizzle_wwww(srctemp), 396 negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle))); 397 398 /* tmp.w = Pow(tmp.y, tmp.w) */ 399 emit1(c, inst->Prev, RC_OPCODE_LG2, 0, 400 dstregtmpmask(temp, RC_MASK_W), 401 swizzle_yyyy(srctemp)); 402 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, 403 dstregtmpmask(temp, RC_MASK_W), 404 swizzle_wwww(srctemp), 405 swizzle_zzzz(srctemp)); 406 emit1(c, inst->Prev, RC_OPCODE_EX2, 0, 407 dstregtmpmask(temp, RC_MASK_W), 408 swizzle_wwww(srctemp)); 409 410 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */ 411 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, 412 dstregtmpmask(temp, RC_MASK_Z), 413 negate(swizzle_xxxx(srctemp)), 414 swizzle_wwww(srctemp), 415 builtin_zero); 416 417 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */ 418 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, 419 dstregtmpmask(temp, RC_MASK_XYW), 420 swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE)); 421 422 rc_remove_instruction(inst); 423 } 424 425 static void transform_LRP(struct radeon_compiler* c, 426 struct rc_instruction* inst) 427 { 428 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 429 430 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, 431 dst, 432 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2])); 433 emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, 434 inst->U.I.DstReg, 435 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]); 436 437 rc_remove_instruction(inst); 438 } 439 440 static void transform_POW(struct radeon_compiler* c, 441 struct rc_instruction* inst) 442 { 443 struct rc_dst_register tempdst = try_to_reuse_dst(c, inst); 444 struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index); 445 tempdst.WriteMask = RC_MASK_W; 446 tempsrc.Swizzle = RC_SWIZZLE_WWWW; 447 448 emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0])); 449 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1])); 450 emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc); 451 452 rc_remove_instruction(inst); 453 } 454 455 /* dst = ROUND(src) : 456 * add = src + .5 457 * frac = FRC(add) 458 * dst = add - frac 459 * 460 * According to the GLSL spec, the implementor can decide which way to round 461 * when the fraction is .5. We round down for .5. 462 * 463 */ 464 static void transform_ROUND(struct radeon_compiler* c, 465 struct rc_instruction* inst) 466 { 467 unsigned int mask = inst->U.I.DstReg.WriteMask; 468 unsigned int frac_index, add_index; 469 struct rc_dst_register frac_dst, add_dst; 470 struct rc_src_register frac_src, add_src; 471 472 /* add = src + .5 */ 473 add_index = rc_find_free_temporary(c); 474 add_dst = dstregtmpmask(add_index, mask); 475 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0], 476 builtin_half); 477 add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index); 478 479 480 /* frac = FRC(add) */ 481 frac_index = rc_find_free_temporary(c); 482 frac_dst = dstregtmpmask(frac_index, mask); 483 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src); 484 frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index); 485 486 /* dst = add - frac */ 487 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg, 488 add_src, negate(frac_src)); 489 rc_remove_instruction(inst); 490 } 491 492 static void transform_RSQ(struct radeon_compiler* c, 493 struct rc_instruction* inst) 494 { 495 inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]); 496 } 497 498 static void transform_SEQ(struct radeon_compiler* c, 499 struct rc_instruction* inst) 500 { 501 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 502 503 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1])); 504 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, 505 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one); 506 507 rc_remove_instruction(inst); 508 } 509 510 static void transform_SFL(struct radeon_compiler* c, 511 struct rc_instruction* inst) 512 { 513 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero); 514 rc_remove_instruction(inst); 515 } 516 517 static void transform_SGE(struct radeon_compiler* c, 518 struct rc_instruction* inst) 519 { 520 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 521 522 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1])); 523 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, 524 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one); 525 526 rc_remove_instruction(inst); 527 } 528 529 static void transform_SGT(struct radeon_compiler* c, 530 struct rc_instruction* inst) 531 { 532 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 533 534 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]); 535 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, 536 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero); 537 538 rc_remove_instruction(inst); 539 } 540 541 static void transform_SLE(struct radeon_compiler* c, 542 struct rc_instruction* inst) 543 { 544 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 545 546 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]); 547 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, 548 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one); 549 550 rc_remove_instruction(inst); 551 } 552 553 static void transform_SLT(struct radeon_compiler* c, 554 struct rc_instruction* inst) 555 { 556 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 557 558 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1])); 559 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, 560 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero); 561 562 rc_remove_instruction(inst); 563 } 564 565 static void transform_SNE(struct radeon_compiler* c, 566 struct rc_instruction* inst) 567 { 568 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 569 570 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1])); 571 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, 572 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero); 573 574 rc_remove_instruction(inst); 575 } 576 577 static void transform_SSG(struct radeon_compiler* c, 578 struct rc_instruction* inst) 579 { 580 /* result = sign(x) 581 * 582 * CMP tmp0, -x, 1, 0 583 * CMP tmp1, x, 1, 0 584 * ADD result, tmp0, -tmp1; 585 */ 586 struct rc_dst_register dst0; 587 unsigned tmp1; 588 589 /* 0 < x */ 590 dst0 = try_to_reuse_dst(c, inst); 591 emit3(c, inst->Prev, RC_OPCODE_CMP, 0, 592 dst0, 593 negate(inst->U.I.SrcReg[0]), 594 builtin_one, 595 builtin_zero); 596 597 /* x < 0 */ 598 tmp1 = rc_find_free_temporary(c); 599 emit3(c, inst->Prev, RC_OPCODE_CMP, 0, 600 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask), 601 inst->U.I.SrcReg[0], 602 builtin_one, 603 builtin_zero); 604 605 /* Either both are zero, or one of them is one and the other is zero. */ 606 /* result = tmp0 - tmp1 */ 607 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, 608 inst->U.I.DstReg, 609 srcreg(RC_FILE_TEMPORARY, dst0.Index), 610 negate(srcreg(RC_FILE_TEMPORARY, tmp1))); 611 612 rc_remove_instruction(inst); 613 } 614 615 static void transform_SUB(struct radeon_compiler* c, 616 struct rc_instruction* inst) 617 { 618 inst->U.I.Opcode = RC_OPCODE_ADD; 619 inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]); 620 } 621 622 static void transform_SWZ(struct radeon_compiler* c, 623 struct rc_instruction* inst) 624 { 625 inst->U.I.Opcode = RC_OPCODE_MOV; 626 } 627 628 static void transform_XPD(struct radeon_compiler* c, 629 struct rc_instruction* inst) 630 { 631 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 632 633 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst, 634 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W), 635 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W)); 636 emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg, 637 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W), 638 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W), 639 negate(srcreg(RC_FILE_TEMPORARY, dst.Index))); 640 641 rc_remove_instruction(inst); 642 } 643 644 645 /** 646 * Can be used as a transformation for @ref radeonClauseLocalTransform, 647 * no userData necessary. 648 * 649 * Eliminates the following ALU instructions: 650 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD 651 * using: 652 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP 653 * 654 * Transforms RSQ to Radeon's native RSQ by explicitly setting 655 * absolute value. 656 * 657 * @note should be applicable to R300 and R500 fragment programs. 658 */ 659 int radeonTransformALU( 660 struct radeon_compiler * c, 661 struct rc_instruction* inst, 662 void* unused) 663 { 664 switch(inst->U.I.Opcode) { 665 case RC_OPCODE_ABS: transform_ABS(c, inst); return 1; 666 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1; 667 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1; 668 case RC_OPCODE_DP2: transform_DP2(c, inst); return 1; 669 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1; 670 case RC_OPCODE_DST: transform_DST(c, inst); return 1; 671 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1; 672 case RC_OPCODE_LIT: transform_LIT(c, inst); return 1; 673 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1; 674 case RC_OPCODE_POW: transform_POW(c, inst); return 1; 675 case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1; 676 case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1; 677 case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1; 678 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1; 679 case RC_OPCODE_SGE: transform_SGE(c, inst); return 1; 680 case RC_OPCODE_SGT: transform_SGT(c, inst); return 1; 681 case RC_OPCODE_SLE: transform_SLE(c, inst); return 1; 682 case RC_OPCODE_SLT: transform_SLT(c, inst); return 1; 683 case RC_OPCODE_SNE: transform_SNE(c, inst); return 1; 684 case RC_OPCODE_SSG: transform_SSG(c, inst); return 1; 685 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1; 686 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1; 687 case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1; 688 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1; 689 default: 690 return 0; 691 } 692 } 693 694 695 static void transform_r300_vertex_ABS(struct radeon_compiler* c, 696 struct rc_instruction* inst) 697 { 698 /* Note: r500 can take absolute values, but r300 cannot. */ 699 inst->U.I.Opcode = RC_OPCODE_MAX; 700 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0]; 701 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; 702 } 703 704 static void transform_r300_vertex_CMP(struct radeon_compiler* c, 705 struct rc_instruction* inst) 706 { 707 /* There is no decent CMP available, so let's rig one up. 708 * CMP is defined as dst = src0 < 0.0 ? src1 : src2 709 * The following sequence consumes zero to two temps and two extra slots 710 * (the second temp and the second slot is consumed by transform_LRP), 711 * but should be equivalent: 712 * 713 * SLT tmp0, src0, 0.0 714 * LRP dst, tmp0, src1, src2 715 * 716 * Yes, I know, I'm a mad scientist. ~ C. & M. */ 717 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 718 719 /* SLT tmp0, src0, 0.0 */ 720 emit2(c, inst->Prev, RC_OPCODE_SLT, 0, 721 dst, 722 inst->U.I.SrcReg[0], builtin_zero); 723 724 /* LRP dst, tmp0, src1, src2 */ 725 transform_LRP(c, 726 emit3(c, inst->Prev, RC_OPCODE_LRP, 0, 727 inst->U.I.DstReg, 728 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])); 729 730 rc_remove_instruction(inst); 731 } 732 733 static void transform_r300_vertex_DP2(struct radeon_compiler* c, 734 struct rc_instruction* inst) 735 { 736 struct rc_instruction *next_inst = inst->Next; 737 transform_DP2(c, inst); 738 next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4; 739 } 740 741 static void transform_r300_vertex_DP3(struct radeon_compiler* c, 742 struct rc_instruction* inst) 743 { 744 struct rc_src_register src0 = inst->U.I.SrcReg[0]; 745 struct rc_src_register src1 = inst->U.I.SrcReg[1]; 746 src0.Negate &= ~RC_MASK_W; 747 src0.Swizzle &= ~(7 << (3 * 3)); 748 src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3); 749 src1.Negate &= ~RC_MASK_W; 750 src1.Swizzle &= ~(7 << (3 * 3)); 751 src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3); 752 emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1); 753 rc_remove_instruction(inst); 754 } 755 756 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c, 757 struct rc_instruction* inst) 758 { 759 struct rc_dst_register dst = try_to_reuse_dst(c, inst); 760 unsigned constant_swizzle; 761 int constant = rc_constants_add_immediate_scalar(&c->Program.Constants, 762 0.0000000000000000001, 763 &constant_swizzle); 764 765 /* MOV dst, src */ 766 dst.WriteMask = RC_MASK_XYZW; 767 emit1(c, inst->Prev, RC_OPCODE_MOV, 0, 768 dst, 769 inst->U.I.SrcReg[0]); 770 771 /* MAX dst.y, src, 0.00...001 */ 772 emit2(c, inst->Prev, RC_OPCODE_MAX, 0, 773 dstregtmpmask(dst.Index, RC_MASK_Y), 774 srcreg(RC_FILE_TEMPORARY, dst.Index), 775 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)); 776 777 inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index); 778 } 779 780 static void transform_r300_vertex_SEQ(struct radeon_compiler *c, 781 struct rc_instruction *inst) 782 { 783 /* x = y <==> x >= y && y >= x */ 784 int tmp = rc_find_free_temporary(c); 785 786 /* x <= y */ 787 emit2(c, inst->Prev, RC_OPCODE_SGE, 0, 788 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask), 789 inst->U.I.SrcReg[0], 790 inst->U.I.SrcReg[1]); 791 792 /* y <= x */ 793 emit2(c, inst->Prev, RC_OPCODE_SGE, 0, 794 inst->U.I.DstReg, 795 inst->U.I.SrcReg[1], 796 inst->U.I.SrcReg[0]); 797 798 /* x && y = x * y */ 799 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, 800 inst->U.I.DstReg, 801 srcreg(RC_FILE_TEMPORARY, tmp), 802 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index)); 803 804 rc_remove_instruction(inst); 805 } 806 807 static void transform_r300_vertex_SNE(struct radeon_compiler *c, 808 struct rc_instruction *inst) 809 { 810 /* x != y <==> x < y || y < x */ 811 int tmp = rc_find_free_temporary(c); 812 813 /* x < y */ 814 emit2(c, inst->Prev, RC_OPCODE_SLT, 0, 815 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask), 816 inst->U.I.SrcReg[0], 817 inst->U.I.SrcReg[1]); 818 819 /* y < x */ 820 emit2(c, inst->Prev, RC_OPCODE_SLT, 0, 821 inst->U.I.DstReg, 822 inst->U.I.SrcReg[1], 823 inst->U.I.SrcReg[0]); 824 825 /* x || y = max(x, y) */ 826 emit2(c, inst->Prev, RC_OPCODE_MAX, 0, 827 inst->U.I.DstReg, 828 srcreg(RC_FILE_TEMPORARY, tmp), 829 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index)); 830 831 rc_remove_instruction(inst); 832 } 833 834 static void transform_r300_vertex_SGT(struct radeon_compiler* c, 835 struct rc_instruction* inst) 836 { 837 /* x > y <==> -x < -y */ 838 inst->U.I.Opcode = RC_OPCODE_SLT; 839 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW; 840 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; 841 } 842 843 static void transform_r300_vertex_SLE(struct radeon_compiler* c, 844 struct rc_instruction* inst) 845 { 846 /* x <= y <==> -x >= -y */ 847 inst->U.I.Opcode = RC_OPCODE_SGE; 848 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW; 849 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; 850 } 851 852 static void transform_r300_vertex_SSG(struct radeon_compiler* c, 853 struct rc_instruction* inst) 854 { 855 /* result = sign(x) 856 * 857 * SLT tmp0, 0, x; 858 * SLT tmp1, x, 0; 859 * ADD result, tmp0, -tmp1; 860 */ 861 struct rc_dst_register dst0 = try_to_reuse_dst(c, inst); 862 unsigned tmp1; 863 864 /* 0 < x */ 865 dst0 = try_to_reuse_dst(c, inst); 866 emit2(c, inst->Prev, RC_OPCODE_SLT, 0, 867 dst0, 868 builtin_zero, 869 inst->U.I.SrcReg[0]); 870 871 /* x < 0 */ 872 tmp1 = rc_find_free_temporary(c); 873 emit2(c, inst->Prev, RC_OPCODE_SLT, 0, 874 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask), 875 inst->U.I.SrcReg[0], 876 builtin_zero); 877 878 /* Either both are zero, or one of them is one and the other is zero. */ 879 /* result = tmp0 - tmp1 */ 880 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, 881 inst->U.I.DstReg, 882 srcreg(RC_FILE_TEMPORARY, dst0.Index), 883 negate(srcreg(RC_FILE_TEMPORARY, tmp1))); 884 885 rc_remove_instruction(inst); 886 } 887 888 static void transform_vertex_TRUNC(struct radeon_compiler* c, 889 struct rc_instruction* inst) 890 { 891 struct rc_instruction *next = inst->Next; 892 893 /* next->Prev is removed after each transformation and replaced 894 * by a new instruction. */ 895 transform_TRUNC(c, next->Prev); 896 transform_r300_vertex_CMP(c, next->Prev); 897 } 898 899 /** 900 * For use with rc_local_transform, this transforms non-native ALU 901 * instructions of the r300 up to r500 vertex engine. 902 */ 903 int r300_transform_vertex_alu( 904 struct radeon_compiler * c, 905 struct rc_instruction* inst, 906 void* unused) 907 { 908 switch(inst->U.I.Opcode) { 909 case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1; 910 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1; 911 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1; 912 case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1; 913 case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1; 914 case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1; 915 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1; 916 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1; 917 case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1; 918 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1; 919 case RC_OPCODE_SEQ: 920 if (!c->is_r500) { 921 transform_r300_vertex_SEQ(c, inst); 922 return 1; 923 } 924 return 0; 925 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1; 926 case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1; 927 case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1; 928 case RC_OPCODE_SNE: 929 if (!c->is_r500) { 930 transform_r300_vertex_SNE(c, inst); 931 return 1; 932 } 933 return 0; 934 case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1; 935 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1; 936 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1; 937 case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1; 938 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1; 939 default: 940 return 0; 941 } 942 } 943 944 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants) 945 { 946 static const float SinCosConsts[2][4] = { 947 { 948 1.273239545, /* 4/PI */ 949 -0.405284735, /* -4/(PI*PI) */ 950 3.141592654, /* PI */ 951 0.2225 /* weight */ 952 }, 953 { 954 0.75, 955 0.5, 956 0.159154943, /* 1/(2*PI) */ 957 6.283185307 /* 2*PI */ 958 } 959 }; 960 int i; 961 962 for(i = 0; i < 2; ++i) 963 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]); 964 } 965 966 /** 967 * Approximate sin(x), where x is clamped to (-pi/2, pi/2). 968 * 969 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) } 970 * MAD tmp.x, tmp.y, |src|, tmp.x 971 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x 972 * MAD dest, tmp.y, weight, tmp.x 973 */ 974 static void sin_approx( 975 struct radeon_compiler* c, struct rc_instruction * inst, 976 struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants) 977 { 978 unsigned int tempreg = rc_find_free_temporary(c); 979 980 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY), 981 swizzle_xxxx(src), 982 srcreg(RC_FILE_CONSTANT, constants[0])); 983 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X), 984 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)), 985 absolute(swizzle_xxxx(src)), 986 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))); 987 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y), 988 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)), 989 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))), 990 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)))); 991 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst, 992 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)), 993 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])), 994 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))); 995 } 996 997 /** 998 * Translate the trigonometric functions COS, SIN, and SCS 999 * using only the basic instructions 1000 * MOV, ADD, MUL, MAD, FRC 1001 */ 1002 int r300_transform_trig_simple(struct radeon_compiler* c, 1003 struct rc_instruction* inst, 1004 void* unused) 1005 { 1006 unsigned int constants[2]; 1007 unsigned int tempreg; 1008 1009 if (inst->U.I.Opcode != RC_OPCODE_COS && 1010 inst->U.I.Opcode != RC_OPCODE_SIN && 1011 inst->U.I.Opcode != RC_OPCODE_SCS) 1012 return 0; 1013 1014 tempreg = rc_find_free_temporary(c); 1015 1016 sincos_constants(c, constants); 1017 1018 if (inst->U.I.Opcode == RC_OPCODE_COS) { 1019 /* MAD tmp.x, src, 1/(2*PI), 0.75 */ 1020 /* FRC tmp.x, tmp.x */ 1021 /* MAD tmp.z, tmp.x, 2*PI, -PI */ 1022 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W), 1023 swizzle_xxxx(inst->U.I.SrcReg[0]), 1024 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])), 1025 swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1]))); 1026 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W), 1027 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg))); 1028 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W), 1029 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), 1030 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])), 1031 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0])))); 1032 1033 sin_approx(c, inst, inst->U.I.DstReg, 1034 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), 1035 constants); 1036 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) { 1037 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W), 1038 swizzle_xxxx(inst->U.I.SrcReg[0]), 1039 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])), 1040 swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1]))); 1041 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W), 1042 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg))); 1043 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W), 1044 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), 1045 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])), 1046 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0])))); 1047 1048 sin_approx(c, inst, inst->U.I.DstReg, 1049 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)), 1050 constants); 1051 } else { 1052 struct rc_dst_register dst; 1053 1054 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY), 1055 swizzle_xxxx(inst->U.I.SrcReg[0]), 1056 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])), 1057 swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W)); 1058 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY), 1059 srcreg(RC_FILE_TEMPORARY, tempreg)); 1060 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY), 1061 srcreg(RC_FILE_TEMPORARY, tempreg), 1062 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])), 1063 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0])))); 1064 1065 dst = inst->U.I.DstReg; 1066 1067 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X; 1068 sin_approx(c, inst, dst, 1069 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)), 1070 constants); 1071 1072 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y; 1073 sin_approx(c, inst, dst, 1074 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)), 1075 constants); 1076 } 1077 1078 rc_remove_instruction(inst); 1079 1080 return 1; 1081 } 1082 1083 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c, 1084 struct rc_instruction *inst, 1085 unsigned srctmp) 1086 { 1087 if (inst->U.I.Opcode == RC_OPCODE_COS) { 1088 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg, 1089 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW)); 1090 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) { 1091 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, 1092 inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW)); 1093 } else if (inst->U.I.Opcode == RC_OPCODE_SCS) { 1094 struct rc_dst_register moddst = inst->U.I.DstReg; 1095 1096 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) { 1097 moddst.WriteMask = RC_MASK_X; 1098 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst, 1099 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW)); 1100 } 1101 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) { 1102 moddst.WriteMask = RC_MASK_Y; 1103 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst, 1104 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW)); 1105 } 1106 } 1107 1108 rc_remove_instruction(inst); 1109 } 1110 1111 1112 /** 1113 * Transform the trigonometric functions COS, SIN, and SCS 1114 * to include pre-scaling by 1/(2*PI) and taking the fractional 1115 * part, so that the input to COS and SIN is always in the range [0,1). 1116 * SCS is replaced by one COS and one SIN instruction. 1117 * 1118 * @warning This transformation implicitly changes the semantics of SIN and COS! 1119 */ 1120 int radeonTransformTrigScale(struct radeon_compiler* c, 1121 struct rc_instruction* inst, 1122 void* unused) 1123 { 1124 static const float RCP_2PI = 0.15915494309189535; 1125 unsigned int temp; 1126 unsigned int constant; 1127 unsigned int constant_swizzle; 1128 1129 if (inst->U.I.Opcode != RC_OPCODE_COS && 1130 inst->U.I.Opcode != RC_OPCODE_SIN && 1131 inst->U.I.Opcode != RC_OPCODE_SCS) 1132 return 0; 1133 1134 temp = rc_find_free_temporary(c); 1135 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle); 1136 1137 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W), 1138 swizzle_xxxx(inst->U.I.SrcReg[0]), 1139 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)); 1140 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W), 1141 srcreg(RC_FILE_TEMPORARY, temp)); 1142 1143 r300_transform_SIN_COS_SCS(c, inst, temp); 1144 return 1; 1145 } 1146 1147 /** 1148 * Transform the trigonometric functions COS, SIN, and SCS 1149 * so that the input to COS and SIN is always in the range [-PI, PI]. 1150 * SCS is replaced by one COS and one SIN instruction. 1151 */ 1152 int r300_transform_trig_scale_vertex(struct radeon_compiler *c, 1153 struct rc_instruction *inst, 1154 void *unused) 1155 { 1156 static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979}; 1157 unsigned int temp; 1158 unsigned int constant; 1159 1160 if (inst->U.I.Opcode != RC_OPCODE_COS && 1161 inst->U.I.Opcode != RC_OPCODE_SIN && 1162 inst->U.I.Opcode != RC_OPCODE_SCS) 1163 return 0; 1164 1165 /* Repeat x in the range [-PI, PI]: 1166 * 1167 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI 1168 */ 1169 1170 temp = rc_find_free_temporary(c); 1171 constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons); 1172 1173 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W), 1174 swizzle_xxxx(inst->U.I.SrcReg[0]), 1175 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX), 1176 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY)); 1177 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W), 1178 srcreg(RC_FILE_TEMPORARY, temp)); 1179 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W), 1180 srcreg(RC_FILE_TEMPORARY, temp), 1181 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ), 1182 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW)); 1183 1184 r300_transform_SIN_COS_SCS(c, inst, temp); 1185 return 1; 1186 } 1187 1188 /** 1189 * Rewrite DDX/DDY instructions to properly work with r5xx shaders. 1190 * The r5xx MDH/MDV instruction provides per-quad partial derivatives. 1191 * It takes the form A*B+C. A and C are set by setting src0. B should be -1. 1192 * 1193 * @warning This explicitly changes the form of DDX and DDY! 1194 */ 1195 1196 int radeonTransformDeriv(struct radeon_compiler* c, 1197 struct rc_instruction* inst, 1198 void* unused) 1199 { 1200 if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY) 1201 return 0; 1202 1203 inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111; 1204 inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW; 1205 1206 return 1; 1207 } 1208 1209 /** 1210 * IF Temp[0].x -> IF Temp[0].x 1211 * ... -> ... 1212 * KILL -> KIL -abs(Temp[0].x) 1213 * ... -> ... 1214 * ENDIF -> ENDIF 1215 * 1216 * === OR === 1217 * 1218 * IF Temp[0].x -\ 1219 * KILL - > KIL -abs(Temp[0].x) 1220 * ENDIF -/ 1221 * 1222 * === OR === 1223 * 1224 * IF Temp[0].x -> IF Temp[0].x 1225 * ... -> ... 1226 * ELSE -> ELSE 1227 * ... -> ... 1228 * KILL -> KIL -abs(Temp[0].x) 1229 * ... -> ... 1230 * ENDIF -> ENDIF 1231 * 1232 * === OR === 1233 * 1234 * KILL -> KIL -none.1111 1235 * 1236 * This needs to be done in its own pass, because it might modify the 1237 * instructions before and after KILL. 1238 */ 1239 void rc_transform_KILL(struct radeon_compiler * c, void *user) 1240 { 1241 struct rc_instruction * inst; 1242 for (inst = c->Program.Instructions.Next; 1243 inst != &c->Program.Instructions; inst = inst->Next) { 1244 struct rc_instruction * if_inst; 1245 unsigned in_if = 0; 1246 1247 if (inst->U.I.Opcode != RC_OPCODE_KILP) 1248 continue; 1249 1250 for (if_inst = inst->Prev; if_inst != &c->Program.Instructions; 1251 if_inst = if_inst->Prev) { 1252 1253 if (if_inst->U.I.Opcode == RC_OPCODE_IF) { 1254 in_if = 1; 1255 break; 1256 } 1257 } 1258 1259 inst->U.I.Opcode = RC_OPCODE_KIL; 1260 1261 if (!in_if) { 1262 inst->U.I.SrcReg[0] = negate(builtin_one); 1263 } else { 1264 /* This should work even if the KILP is inside the ELSE 1265 * block, because -0.0 is considered negative. */ 1266 inst->U.I.SrcReg[0] = 1267 negate(absolute(if_inst->U.I.SrcReg[0])); 1268 1269 if (inst->Prev->U.I.Opcode != RC_OPCODE_IF 1270 && inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) { 1271 1272 /* Optimize the special case: 1273 * IF Temp[0].x 1274 * KILP 1275 * ENDIF 1276 */ 1277 1278 /* Remove IF */ 1279 rc_remove_instruction(inst->Prev); 1280 /* Remove ENDIF */ 1281 rc_remove_instruction(inst->Next); 1282 } 1283 } 1284 } 1285 } 1286 1287 int rc_force_output_alpha_to_one(struct radeon_compiler *c, 1288 struct rc_instruction *inst, void *data) 1289 { 1290 struct r300_fragment_program_compiler *fragc = (struct r300_fragment_program_compiler*)c; 1291 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode); 1292 unsigned tmp; 1293 1294 if (!info->HasDstReg || inst->U.I.DstReg.File != RC_FILE_OUTPUT || 1295 inst->U.I.DstReg.Index == fragc->OutputDepth) 1296 return 1; 1297 1298 tmp = rc_find_free_temporary(c); 1299 1300 /* Insert MOV after inst, set alpha to 1. */ 1301 emit1(c, inst, RC_OPCODE_MOV, 0, inst->U.I.DstReg, 1302 srcregswz(RC_FILE_TEMPORARY, tmp, RC_SWIZZLE_XYZ1)); 1303 1304 /* Re-route the destination of inst to the source of mov. */ 1305 inst->U.I.DstReg.File = RC_FILE_TEMPORARY; 1306 inst->U.I.DstReg.Index = tmp; 1307 1308 /* Move the saturate output modifier to the MOV instruction 1309 * (for better copy propagation). */ 1310 inst->Next->U.I.SaturateMode = inst->U.I.SaturateMode; 1311 inst->U.I.SaturateMode = RC_SATURATE_NONE; 1312 return 1; 1313 } 1314