1 /* 2 * Copyright 2009 Nicolai Hhnle <nhaehnle (at) gmail.com> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */ 22 23 #include "radeon_compiler.h" 24 25 #include <stdio.h> 26 27 #include "r300_reg.h" 28 29 #include "radeon_compiler_util.h" 30 #include "radeon_dataflow.h" 31 #include "radeon_program.h" 32 #include "radeon_program_alu.h" 33 #include "radeon_swizzle.h" 34 #include "radeon_emulate_branches.h" 35 #include "radeon_emulate_loops.h" 36 #include "radeon_remove_constants.h" 37 38 /* 39 * Take an already-setup and valid source then swizzle it appropriately to 40 * obtain a constant ZERO or ONE source. 41 */ 42 #define __CONST(x, y) \ 43 (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \ 44 t_swizzle(y), \ 45 t_swizzle(y), \ 46 t_swizzle(y), \ 47 t_swizzle(y), \ 48 t_src_class(vpi->SrcReg[x].File), \ 49 RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4)) 50 51 52 static unsigned long t_dst_mask(unsigned int mask) 53 { 54 /* RC_MASK_* is equivalent to VSF_FLAG_* */ 55 return mask & RC_MASK_XYZW; 56 } 57 58 static unsigned long t_dst_class(rc_register_file file) 59 { 60 switch (file) { 61 default: 62 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 63 /* fall-through */ 64 case RC_FILE_TEMPORARY: 65 return PVS_DST_REG_TEMPORARY; 66 case RC_FILE_OUTPUT: 67 return PVS_DST_REG_OUT; 68 case RC_FILE_ADDRESS: 69 return PVS_DST_REG_A0; 70 } 71 } 72 73 static unsigned long t_dst_index(struct r300_vertex_program_code *vp, 74 struct rc_dst_register *dst) 75 { 76 if (dst->File == RC_FILE_OUTPUT) 77 return vp->outputs[dst->Index]; 78 79 return dst->Index; 80 } 81 82 static unsigned long t_src_class(rc_register_file file) 83 { 84 switch (file) { 85 default: 86 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 87 /* fall-through */ 88 case RC_FILE_NONE: 89 case RC_FILE_TEMPORARY: 90 return PVS_SRC_REG_TEMPORARY; 91 case RC_FILE_INPUT: 92 return PVS_SRC_REG_INPUT; 93 case RC_FILE_CONSTANT: 94 return PVS_SRC_REG_CONSTANT; 95 } 96 } 97 98 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b) 99 { 100 unsigned long aclass = t_src_class(a.File); 101 unsigned long bclass = t_src_class(b.File); 102 103 if (aclass != bclass) 104 return 0; 105 if (aclass == PVS_SRC_REG_TEMPORARY) 106 return 0; 107 108 if (a.RelAddr || b.RelAddr) 109 return 1; 110 if (a.Index != b.Index) 111 return 1; 112 113 return 0; 114 } 115 116 static inline unsigned long t_swizzle(unsigned int swizzle) 117 { 118 /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */ 119 return swizzle; 120 } 121 122 static unsigned long t_src_index(struct r300_vertex_program_code *vp, 123 struct rc_src_register *src) 124 { 125 if (src->File == RC_FILE_INPUT) { 126 assert(vp->inputs[src->Index] != -1); 127 return vp->inputs[src->Index]; 128 } else { 129 if (src->Index < 0) { 130 fprintf(stderr, 131 "negative offsets for indirect addressing do not work.\n"); 132 return 0; 133 } 134 return src->Index; 135 } 136 } 137 138 /* these two functions should probably be merged... */ 139 140 static unsigned long t_src(struct r300_vertex_program_code *vp, 141 struct rc_src_register *src) 142 { 143 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 144 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 145 */ 146 return PVS_SRC_OPERAND(t_src_index(vp, src), 147 t_swizzle(GET_SWZ(src->Swizzle, 0)), 148 t_swizzle(GET_SWZ(src->Swizzle, 1)), 149 t_swizzle(GET_SWZ(src->Swizzle, 2)), 150 t_swizzle(GET_SWZ(src->Swizzle, 3)), 151 t_src_class(src->File), 152 src->Negate) | 153 (src->RelAddr << 4) | (src->Abs << 3); 154 } 155 156 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, 157 struct rc_src_register *src) 158 { 159 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 160 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 161 */ 162 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle); 163 164 return PVS_SRC_OPERAND(t_src_index(vp, src), 165 t_swizzle(swz), 166 t_swizzle(swz), 167 t_swizzle(swz), 168 t_swizzle(swz), 169 t_src_class(src->File), 170 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 171 (src->RelAddr << 4) | (src->Abs << 3); 172 } 173 174 static int valid_dst(struct r300_vertex_program_code *vp, 175 struct rc_dst_register *dst) 176 { 177 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) { 178 return 0; 179 } else if (dst->File == RC_FILE_ADDRESS) { 180 assert(dst->Index == 0); 181 } 182 183 return 1; 184 } 185 186 static void ei_vector1(struct r300_vertex_program_code *vp, 187 unsigned int hw_opcode, 188 struct rc_sub_instruction *vpi, 189 unsigned int * inst) 190 { 191 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 192 0, 193 0, 194 t_dst_index(vp, &vpi->DstReg), 195 t_dst_mask(vpi->DstReg.WriteMask), 196 t_dst_class(vpi->DstReg.File), 197 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 198 inst[1] = t_src(vp, &vpi->SrcReg[0]); 199 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 200 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 201 } 202 203 static void ei_vector2(struct r300_vertex_program_code *vp, 204 unsigned int hw_opcode, 205 struct rc_sub_instruction *vpi, 206 unsigned int * inst) 207 { 208 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 209 0, 210 0, 211 t_dst_index(vp, &vpi->DstReg), 212 t_dst_mask(vpi->DstReg.WriteMask), 213 t_dst_class(vpi->DstReg.File), 214 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 215 inst[1] = t_src(vp, &vpi->SrcReg[0]); 216 inst[2] = t_src(vp, &vpi->SrcReg[1]); 217 inst[3] = __CONST(1, RC_SWIZZLE_ZERO); 218 } 219 220 static void ei_math1(struct r300_vertex_program_code *vp, 221 unsigned int hw_opcode, 222 struct rc_sub_instruction *vpi, 223 unsigned int * inst) 224 { 225 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 226 1, 227 0, 228 t_dst_index(vp, &vpi->DstReg), 229 t_dst_mask(vpi->DstReg.WriteMask), 230 t_dst_class(vpi->DstReg.File), 231 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 232 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 233 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 234 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 235 } 236 237 static void ei_lit(struct r300_vertex_program_code *vp, 238 struct rc_sub_instruction *vpi, 239 unsigned int * inst) 240 { 241 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} 242 243 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, 244 1, 245 0, 246 t_dst_index(vp, &vpi->DstReg), 247 t_dst_mask(vpi->DstReg.WriteMask), 248 t_dst_class(vpi->DstReg.File), 249 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 250 /* NOTE: Users swizzling might not work. */ 251 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 252 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 253 PVS_SRC_SELECT_FORCE_0, // Z 254 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 255 t_src_class(vpi->SrcReg[0].File), 256 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 257 (vpi->SrcReg[0].RelAddr << 4); 258 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 259 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 260 PVS_SRC_SELECT_FORCE_0, // Z 261 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 262 t_src_class(vpi->SrcReg[0].File), 263 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 264 (vpi->SrcReg[0].RelAddr << 4); 265 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 266 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 267 PVS_SRC_SELECT_FORCE_0, // Z 268 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 269 t_src_class(vpi->SrcReg[0].File), 270 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 271 (vpi->SrcReg[0].RelAddr << 4); 272 } 273 274 static void ei_mad(struct r300_vertex_program_code *vp, 275 struct rc_sub_instruction *vpi, 276 unsigned int * inst) 277 { 278 unsigned int i; 279 /* Remarks about hardware limitations of MAD 280 * (please preserve this comment, as this information is _NOT_ 281 * in the documentation provided by AMD). 282 * 283 * As described in the documentation, MAD with three unique temporary 284 * source registers requires the use of the macro version. 285 * 286 * However (and this is not mentioned in the documentation), apparently 287 * the macro version is _NOT_ a full superset of the normal version. 288 * In particular, the macro version does not always work when relative 289 * addressing is used in the source operands. 290 * 291 * This limitation caused incorrect rendering in Sauerbraten's OpenGL 292 * assembly shader path when using medium quality animations 293 * (i.e. animations with matrix blending instead of quaternion blending). 294 * 295 * Unfortunately, I (nha) have been unable to extract a Piglit regression 296 * test for this issue - for some reason, it is possible to have vertex 297 * programs whose prefix is *exactly* the same as the prefix of the 298 * offending program in Sauerbraten up to the offending instruction 299 * without causing any trouble. 300 * 301 * Bottom line: Only use the macro version only when really necessary; 302 * according to AMD docs, this should improve performance by one clock 303 * as a nice side bonus. 304 */ 305 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && 306 vpi->SrcReg[1].File == RC_FILE_TEMPORARY && 307 vpi->SrcReg[2].File == RC_FILE_TEMPORARY && 308 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index && 309 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index && 310 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) { 311 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, 312 0, 313 1, 314 t_dst_index(vp, &vpi->DstReg), 315 t_dst_mask(vpi->DstReg.WriteMask), 316 t_dst_class(vpi->DstReg.File), 317 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 318 } else { 319 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, 320 0, 321 0, 322 t_dst_index(vp, &vpi->DstReg), 323 t_dst_mask(vpi->DstReg.WriteMask), 324 t_dst_class(vpi->DstReg.File), 325 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 326 327 /* Arguments with constant swizzles still count as a unique 328 * temporary, so we should make sure these arguments share a 329 * register index with one of the other arguments. */ 330 for (i = 0; i < 3; i++) { 331 unsigned int j; 332 if (vpi->SrcReg[i].File != RC_FILE_NONE) 333 continue; 334 335 for (j = 0; j < 3; j++) { 336 if (i != j) { 337 vpi->SrcReg[i].Index = 338 vpi->SrcReg[j].Index; 339 break; 340 } 341 } 342 } 343 } 344 inst[1] = t_src(vp, &vpi->SrcReg[0]); 345 inst[2] = t_src(vp, &vpi->SrcReg[1]); 346 inst[3] = t_src(vp, &vpi->SrcReg[2]); 347 } 348 349 static void ei_pow(struct r300_vertex_program_code *vp, 350 struct rc_sub_instruction *vpi, 351 unsigned int * inst) 352 { 353 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, 354 1, 355 0, 356 t_dst_index(vp, &vpi->DstReg), 357 t_dst_mask(vpi->DstReg.WriteMask), 358 t_dst_class(vpi->DstReg.File), 359 vpi->SaturateMode == RC_SATURATE_ZERO_ONE); 360 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 361 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 362 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]); 363 } 364 365 static void translate_vertex_program(struct radeon_compiler *c, void *user) 366 { 367 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 368 struct rc_instruction *rci; 369 370 unsigned loops[R500_PVS_MAX_LOOP_DEPTH]; 371 unsigned loop_depth = 0; 372 373 compiler->code->pos_end = 0; /* Not supported yet */ 374 compiler->code->length = 0; 375 compiler->code->num_temporaries = 0; 376 377 compiler->SetHwInputOutput(compiler); 378 379 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) { 380 struct rc_sub_instruction *vpi = &rci->U.I; 381 unsigned int *inst = compiler->code->body.d + compiler->code->length; 382 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode); 383 384 /* Skip instructions writing to non-existing destination */ 385 if (!valid_dst(compiler->code, &vpi->DstReg)) 386 continue; 387 388 if (info->HasDstReg) { 389 /* Neither is Saturate. */ 390 if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) { 391 rc_error(&compiler->Base, "Vertex program does not support the Saturate " 392 "modifier (yet).\n"); 393 } 394 } 395 396 if (compiler->code->length >= c->max_alu_insts * 4) { 397 rc_error(&compiler->Base, "Vertex program has too many instructions\n"); 398 return; 399 } 400 401 assert(compiler->Base.is_r500 || 402 (vpi->Opcode != RC_OPCODE_SEQ && 403 vpi->Opcode != RC_OPCODE_SNE)); 404 405 switch (vpi->Opcode) { 406 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break; 407 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; 408 case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break; 409 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; 410 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; 411 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; 412 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; 413 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; 414 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; 415 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; 416 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; 417 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; 418 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break; 419 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break; 420 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break; 421 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break; 422 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break; 423 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break; 424 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break; 425 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break; 426 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break; 427 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break; 428 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break; 429 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break; 430 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break; 431 case RC_OPCODE_BGNLOOP: 432 { 433 if ((!compiler->Base.is_r500 434 && loop_depth >= R300_VS_MAX_LOOP_DEPTH) 435 || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) { 436 rc_error(&compiler->Base, 437 "Loops are nested too deep."); 438 return; 439 } 440 loops[loop_depth++] = ((compiler->code->length)/ 4) + 1; 441 break; 442 } 443 case RC_OPCODE_ENDLOOP: 444 { 445 unsigned int act_addr; 446 unsigned int last_addr; 447 unsigned int ret_addr; 448 449 ret_addr = loops[--loop_depth]; 450 act_addr = ret_addr - 1; 451 last_addr = (compiler->code->length / 4) - 1; 452 453 if (loop_depth >= R300_VS_MAX_FC_OPS) { 454 rc_error(&compiler->Base, 455 "Too many flow control instructions."); 456 return; 457 } 458 if (compiler->Base.is_r500) { 459 compiler->code->fc_op_addrs.r500 460 [compiler->code->num_fc_ops].lw = 461 R500_PVS_FC_ACT_ADRS(act_addr) 462 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff) 463 ; 464 compiler->code->fc_op_addrs.r500 465 [compiler->code->num_fc_ops].uw = 466 R500_PVS_FC_LAST_INST(last_addr) 467 | R500_PVS_FC_RTN_INST(ret_addr) 468 ; 469 } else { 470 compiler->code->fc_op_addrs.r300 471 [compiler->code->num_fc_ops] = 472 R300_PVS_FC_ACT_ADRS(act_addr) 473 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) 474 | R300_PVS_FC_LAST_INST(last_addr) 475 | R300_PVS_FC_RTN_INST(ret_addr) 476 ; 477 } 478 compiler->code->fc_loop_index[compiler->code->num_fc_ops] = 479 R300_PVS_FC_LOOP_INIT_VAL(0x0) 480 | R300_PVS_FC_LOOP_STEP_VAL(0x1) 481 ; 482 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP( 483 compiler->code->num_fc_ops); 484 compiler->code->num_fc_ops++; 485 486 break; 487 } 488 489 case RC_ME_PRED_SET_CLR: 490 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst); 491 break; 492 493 case RC_ME_PRED_SET_INV: 494 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst); 495 break; 496 497 case RC_ME_PRED_SET_POP: 498 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst); 499 break; 500 501 case RC_ME_PRED_SET_RESTORE: 502 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst); 503 break; 504 505 case RC_ME_PRED_SEQ: 506 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst); 507 break; 508 509 case RC_ME_PRED_SNEQ: 510 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst); 511 break; 512 513 case RC_VE_PRED_SNEQ_PUSH: 514 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, 515 vpi, inst); 516 break; 517 518 default: 519 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name); 520 return; 521 } 522 523 if (vpi->DstReg.Pred != RC_PRED_DISABLED) { 524 inst[0] |= (PVS_DST_PRED_ENABLE_MASK 525 << PVS_DST_PRED_ENABLE_SHIFT); 526 if (vpi->DstReg.Pred == RC_PRED_SET) { 527 inst[0] |= (PVS_DST_PRED_SENSE_MASK 528 << PVS_DST_PRED_SENSE_SHIFT); 529 } 530 } 531 532 /* Update the number of temporaries. */ 533 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY && 534 vpi->DstReg.Index >= compiler->code->num_temporaries) 535 compiler->code->num_temporaries = vpi->DstReg.Index + 1; 536 537 for (unsigned i = 0; i < info->NumSrcRegs; i++) 538 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY && 539 vpi->SrcReg[i].Index >= compiler->code->num_temporaries) 540 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; 541 542 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { 543 rc_error(&compiler->Base, "Too many temporaries.\n"); 544 return; 545 } 546 547 compiler->code->length += 4; 548 549 if (compiler->Base.Error) 550 return; 551 } 552 } 553 554 struct temporary_allocation { 555 unsigned int Allocated:1; 556 unsigned int HwTemp:15; 557 struct rc_instruction * LastRead; 558 }; 559 560 static void allocate_temporary_registers(struct radeon_compiler *c, void *user) 561 { 562 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 563 struct rc_instruction *inst; 564 struct rc_instruction *end_loop = NULL; 565 unsigned int num_orig_temps = 0; 566 char hwtemps[RC_REGISTER_MAX_INDEX]; 567 struct temporary_allocation * ta; 568 unsigned int i, j; 569 570 memset(hwtemps, 0, sizeof(hwtemps)); 571 572 rc_recompute_ips(c); 573 574 /* Pass 1: Count original temporaries. */ 575 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 576 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 577 578 for (i = 0; i < opcode->NumSrcRegs; ++i) { 579 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 580 if (inst->U.I.SrcReg[i].Index >= num_orig_temps) 581 num_orig_temps = inst->U.I.SrcReg[i].Index + 1; 582 } 583 } 584 585 if (opcode->HasDstReg) { 586 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 587 if (inst->U.I.DstReg.Index >= num_orig_temps) 588 num_orig_temps = inst->U.I.DstReg.Index + 1; 589 } 590 } 591 } 592 593 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool, 594 sizeof(struct temporary_allocation) * num_orig_temps); 595 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps); 596 597 /* Pass 2: Determine original temporary lifetimes */ 598 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 599 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 600 /* Instructions inside of loops need to use the ENDLOOP 601 * instruction as their LastRead. */ 602 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) { 603 int endloops = 1; 604 struct rc_instruction * ptr; 605 for(ptr = inst->Next; 606 ptr != &compiler->Base.Program.Instructions; 607 ptr = ptr->Next){ 608 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) { 609 endloops++; 610 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) { 611 endloops--; 612 if (endloops <= 0) { 613 end_loop = ptr; 614 break; 615 } 616 } 617 } 618 } 619 620 if (inst == end_loop) { 621 end_loop = NULL; 622 continue; 623 } 624 625 for (i = 0; i < opcode->NumSrcRegs; ++i) { 626 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 627 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst; 628 } 629 } 630 } 631 632 /* Pass 3: Register allocation */ 633 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 634 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 635 636 for (i = 0; i < opcode->NumSrcRegs; ++i) { 637 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 638 unsigned int orig = inst->U.I.SrcReg[i].Index; 639 inst->U.I.SrcReg[i].Index = ta[orig].HwTemp; 640 641 if (ta[orig].Allocated && inst == ta[orig].LastRead) 642 hwtemps[ta[orig].HwTemp] = 0; 643 } 644 } 645 646 if (opcode->HasDstReg) { 647 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 648 unsigned int orig = inst->U.I.DstReg.Index; 649 650 if (!ta[orig].Allocated) { 651 for(j = 0; j < c->max_temp_regs; ++j) { 652 if (!hwtemps[j]) 653 break; 654 } 655 ta[orig].Allocated = 1; 656 ta[orig].HwTemp = j; 657 hwtemps[ta[orig].HwTemp] = 1; 658 } 659 660 inst->U.I.DstReg.Index = ta[orig].HwTemp; 661 } 662 } 663 } 664 } 665 666 /** 667 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier 668 * and the Saturate opcode modifier. Only Absolute is currently transformed. 669 */ 670 static int transform_nonnative_modifiers( 671 struct radeon_compiler *c, 672 struct rc_instruction *inst, 673 void* unused) 674 { 675 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode); 676 unsigned i; 677 678 /* Transform ABS(a) to MAX(a, -a). */ 679 for (i = 0; i < opcode->NumSrcRegs; i++) { 680 if (inst->U.I.SrcReg[i].Abs) { 681 struct rc_instruction *new_inst; 682 unsigned temp; 683 684 inst->U.I.SrcReg[i].Abs = 0; 685 686 temp = rc_find_free_temporary(c); 687 688 new_inst = rc_insert_new_instruction(c, inst->Prev); 689 new_inst->U.I.Opcode = RC_OPCODE_MAX; 690 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY; 691 new_inst->U.I.DstReg.Index = temp; 692 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i]; 693 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i]; 694 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; 695 696 memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i])); 697 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY; 698 inst->U.I.SrcReg[i].Index = temp; 699 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW; 700 } 701 } 702 return 1; 703 } 704 705 /** 706 * Vertex engine cannot read two inputs or two constants at the same time. 707 * Introduce intermediate MOVs to temporary registers to account for this. 708 */ 709 static int transform_source_conflicts( 710 struct radeon_compiler *c, 711 struct rc_instruction* inst, 712 void* unused) 713 { 714 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 715 716 if (opcode->NumSrcRegs == 3) { 717 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) 718 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) { 719 int tmpreg = rc_find_free_temporary(c); 720 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 721 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 722 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 723 inst_mov->U.I.DstReg.Index = tmpreg; 724 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; 725 726 reset_srcreg(&inst->U.I.SrcReg[2]); 727 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY; 728 inst->U.I.SrcReg[2].Index = tmpreg; 729 } 730 } 731 732 if (opcode->NumSrcRegs >= 2) { 733 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) { 734 int tmpreg = rc_find_free_temporary(c); 735 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 736 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 737 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 738 inst_mov->U.I.DstReg.Index = tmpreg; 739 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1]; 740 741 reset_srcreg(&inst->U.I.SrcReg[1]); 742 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY; 743 inst->U.I.SrcReg[1].Index = tmpreg; 744 } 745 } 746 747 return 1; 748 } 749 750 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user) 751 { 752 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c; 753 int i; 754 755 for(i = 0; i < 32; ++i) { 756 if ((compiler->RequiredOutputs & (1 << i)) && 757 !(compiler->Base.Program.OutputsWritten & (1 << i))) { 758 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev); 759 inst->U.I.Opcode = RC_OPCODE_MOV; 760 761 inst->U.I.DstReg.File = RC_FILE_OUTPUT; 762 inst->U.I.DstReg.Index = i; 763 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW; 764 765 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT; 766 inst->U.I.SrcReg[0].Index = 0; 767 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 768 769 compiler->Base.Program.OutputsWritten |= 1 << i; 770 } 771 } 772 } 773 774 static void dataflow_outputs_mark_used(void * userdata, void * data, 775 void (*callback)(void *, unsigned int, unsigned int)) 776 { 777 struct r300_vertex_program_compiler * c = userdata; 778 int i; 779 780 for(i = 0; i < 32; ++i) { 781 if (c->RequiredOutputs & (1 << i)) 782 callback(data, i, RC_MASK_XYZW); 783 } 784 } 785 786 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) 787 { 788 (void) opcode; 789 (void) reg; 790 791 return 1; 792 } 793 794 static void transform_negative_addressing(struct r300_vertex_program_compiler *c, 795 struct rc_instruction *arl, 796 struct rc_instruction *end, 797 int min_offset) 798 { 799 struct rc_instruction *inst, *add; 800 unsigned const_swizzle; 801 802 /* Transform ARL/ARR */ 803 add = rc_insert_new_instruction(&c->Base, arl->Prev); 804 add->U.I.Opcode = RC_OPCODE_ADD; 805 add->U.I.DstReg.File = RC_FILE_TEMPORARY; 806 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base); 807 add->U.I.DstReg.WriteMask = RC_MASK_X; 808 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0]; 809 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT; 810 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants, 811 min_offset, &const_swizzle); 812 add->U.I.SrcReg[1].Swizzle = const_swizzle; 813 814 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY; 815 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index; 816 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX; 817 818 /* Rewrite offsets up to and excluding inst. */ 819 for (inst = arl->Next; inst != end; inst = inst->Next) { 820 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 821 822 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) 823 if (inst->U.I.SrcReg[i].RelAddr) 824 inst->U.I.SrcReg[i].Index -= min_offset; 825 } 826 } 827 828 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user) 829 { 830 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler; 831 struct rc_instruction *inst, *lastARL = NULL; 832 int min_offset = 0; 833 834 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) { 835 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 836 837 if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) { 838 if (lastARL != NULL && min_offset < 0) 839 transform_negative_addressing(c, lastARL, inst, min_offset); 840 841 lastARL = inst; 842 min_offset = 0; 843 continue; 844 } 845 846 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) { 847 if (inst->U.I.SrcReg[i].RelAddr && 848 inst->U.I.SrcReg[i].Index < 0) { 849 /* ARL must precede any indirect addressing. */ 850 if (!lastARL) { 851 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR."); 852 return; 853 } 854 855 if (inst->U.I.SrcReg[i].Index < min_offset) 856 min_offset = inst->U.I.SrcReg[i].Index; 857 } 858 } 859 } 860 861 if (lastARL != NULL && min_offset < 0) 862 transform_negative_addressing(c, lastARL, inst, min_offset); 863 } 864 865 struct rc_swizzle_caps r300_vertprog_swizzle_caps = { 866 .IsNative = &swizzle_is_native, 867 .Split = 0 /* should never be called */ 868 }; 869 870 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) 871 { 872 int is_r500 = c->Base.is_r500; 873 int opt = !c->Base.disable_optimizations; 874 875 /* Lists of instruction transformations. */ 876 struct radeon_program_transformation alu_rewrite_r500[] = { 877 { &r300_transform_vertex_alu, 0 }, 878 { &r300_transform_trig_scale_vertex, 0 }, 879 { 0, 0 } 880 }; 881 882 struct radeon_program_transformation alu_rewrite_r300[] = { 883 { &r300_transform_vertex_alu, 0 }, 884 { &r300_transform_trig_simple, 0 }, 885 { 0, 0 } 886 }; 887 888 /* Note: These passes have to be done seperately from ALU rewrite, 889 * otherwise non-native ALU instructions with source conflits 890 * or non-native modifiers will not be treated properly. 891 */ 892 struct radeon_program_transformation emulate_modifiers[] = { 893 { &transform_nonnative_modifiers, 0 }, 894 { 0, 0 } 895 }; 896 897 struct radeon_program_transformation resolve_src_conflicts[] = { 898 { &transform_source_conflicts, 0 }, 899 { 0, 0 } 900 }; 901 902 /* List of compiler passes. */ 903 struct radeon_compiler_pass vs_list[] = { 904 /* NAME DUMP PREDICATE FUNCTION PARAM */ 905 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL}, 906 {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL}, 907 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL}, 908 {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500}, 909 {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300}, 910 {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers}, 911 {"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used}, 912 {"dataflow optimize", 1, opt, rc_optimize, NULL}, 913 /* This pass must be done after optimizations. */ 914 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts}, 915 {"register allocation", 1, opt, allocate_temporary_registers, NULL}, 916 {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table}, 917 {"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL}, 918 {"final code validation", 0, 1, rc_validate_final_shader, NULL}, 919 {"machine code generation", 0, 1, translate_vertex_program, NULL}, 920 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL}, 921 {NULL, 0, 0, NULL, NULL} 922 }; 923 924 c->Base.type = RC_VERTEX_PROGRAM; 925 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; 926 927 rc_run_compiler(&c->Base, vs_list); 928 929 c->code->InputsRead = c->Base.Program.InputsRead; 930 c->code->OutputsWritten = c->Base.Program.OutputsWritten; 931 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants); 932 } 933