1 /* 2 * Copyright 2009 Nicolai Hhnle <nhaehnle (at) gmail.com> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. */ 22 23 #include "radeon_compiler.h" 24 25 #include <stdio.h> 26 27 #include "../r300_reg.h" 28 29 #include "radeon_compiler_util.h" 30 #include "radeon_dataflow.h" 31 #include "radeon_program.h" 32 #include "radeon_program_alu.h" 33 #include "radeon_swizzle.h" 34 #include "radeon_emulate_branches.h" 35 #include "radeon_emulate_loops.h" 36 #include "radeon_remove_constants.h" 37 38 /* 39 * Take an already-setup and valid source then swizzle it appropriately to 40 * obtain a constant ZERO or ONE source. 41 */ 42 #define __CONST(x, y) \ 43 (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \ 44 t_swizzle(y), \ 45 t_swizzle(y), \ 46 t_swizzle(y), \ 47 t_swizzle(y), \ 48 t_src_class(vpi->SrcReg[x].File), \ 49 RC_MASK_NONE) | (vpi->SrcReg[x].RelAddr << 4)) 50 51 52 static unsigned long t_dst_mask(unsigned int mask) 53 { 54 /* RC_MASK_* is equivalent to VSF_FLAG_* */ 55 return mask & RC_MASK_XYZW; 56 } 57 58 static unsigned long t_dst_class(rc_register_file file) 59 { 60 switch (file) { 61 default: 62 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 63 /* fall-through */ 64 case RC_FILE_TEMPORARY: 65 return PVS_DST_REG_TEMPORARY; 66 case RC_FILE_OUTPUT: 67 return PVS_DST_REG_OUT; 68 case RC_FILE_ADDRESS: 69 return PVS_DST_REG_A0; 70 } 71 } 72 73 static unsigned long t_dst_index(struct r300_vertex_program_code *vp, 74 struct rc_dst_register *dst) 75 { 76 if (dst->File == RC_FILE_OUTPUT) 77 return vp->outputs[dst->Index]; 78 79 return dst->Index; 80 } 81 82 static unsigned long t_src_class(rc_register_file file) 83 { 84 switch (file) { 85 default: 86 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file); 87 /* fall-through */ 88 case RC_FILE_NONE: 89 case RC_FILE_TEMPORARY: 90 return PVS_SRC_REG_TEMPORARY; 91 case RC_FILE_INPUT: 92 return PVS_SRC_REG_INPUT; 93 case RC_FILE_CONSTANT: 94 return PVS_SRC_REG_CONSTANT; 95 } 96 } 97 98 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b) 99 { 100 unsigned long aclass = t_src_class(a.File); 101 unsigned long bclass = t_src_class(b.File); 102 103 if (aclass != bclass) 104 return 0; 105 if (aclass == PVS_SRC_REG_TEMPORARY) 106 return 0; 107 108 if (a.RelAddr || b.RelAddr) 109 return 1; 110 if (a.Index != b.Index) 111 return 1; 112 113 return 0; 114 } 115 116 static inline unsigned long t_swizzle(unsigned int swizzle) 117 { 118 /* this is in fact a NOP as the Mesa RC_SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */ 119 return swizzle; 120 } 121 122 static unsigned long t_src_index(struct r300_vertex_program_code *vp, 123 struct rc_src_register *src) 124 { 125 if (src->File == RC_FILE_INPUT) { 126 assert(vp->inputs[src->Index] != -1); 127 return vp->inputs[src->Index]; 128 } else { 129 if (src->Index < 0) { 130 fprintf(stderr, 131 "negative offsets for indirect addressing do not work.\n"); 132 return 0; 133 } 134 return src->Index; 135 } 136 } 137 138 /* these two functions should probably be merged... */ 139 140 static unsigned long t_src(struct r300_vertex_program_code *vp, 141 struct rc_src_register *src) 142 { 143 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 144 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 145 */ 146 return PVS_SRC_OPERAND(t_src_index(vp, src), 147 t_swizzle(GET_SWZ(src->Swizzle, 0)), 148 t_swizzle(GET_SWZ(src->Swizzle, 1)), 149 t_swizzle(GET_SWZ(src->Swizzle, 2)), 150 t_swizzle(GET_SWZ(src->Swizzle, 3)), 151 t_src_class(src->File), 152 src->Negate) | 153 (src->RelAddr << 4) | (src->Abs << 3); 154 } 155 156 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp, 157 struct rc_src_register *src) 158 { 159 /* src->Negate uses the RC_MASK_ flags from program_instruction.h, 160 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here. 161 */ 162 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle); 163 164 return PVS_SRC_OPERAND(t_src_index(vp, src), 165 t_swizzle(swz), 166 t_swizzle(swz), 167 t_swizzle(swz), 168 t_swizzle(swz), 169 t_src_class(src->File), 170 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 171 (src->RelAddr << 4) | (src->Abs << 3); 172 } 173 174 static int valid_dst(struct r300_vertex_program_code *vp, 175 struct rc_dst_register *dst) 176 { 177 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) { 178 return 0; 179 } else if (dst->File == RC_FILE_ADDRESS) { 180 assert(dst->Index == 0); 181 } 182 183 return 1; 184 } 185 186 static void ei_vector1(struct r300_vertex_program_code *vp, 187 unsigned int hw_opcode, 188 struct rc_sub_instruction *vpi, 189 unsigned int * inst) 190 { 191 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 192 0, 193 0, 194 t_dst_index(vp, &vpi->DstReg), 195 t_dst_mask(vpi->DstReg.WriteMask), 196 t_dst_class(vpi->DstReg.File)); 197 inst[1] = t_src(vp, &vpi->SrcReg[0]); 198 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 199 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 200 } 201 202 static void ei_vector2(struct r300_vertex_program_code *vp, 203 unsigned int hw_opcode, 204 struct rc_sub_instruction *vpi, 205 unsigned int * inst) 206 { 207 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 208 0, 209 0, 210 t_dst_index(vp, &vpi->DstReg), 211 t_dst_mask(vpi->DstReg.WriteMask), 212 t_dst_class(vpi->DstReg.File)); 213 inst[1] = t_src(vp, &vpi->SrcReg[0]); 214 inst[2] = t_src(vp, &vpi->SrcReg[1]); 215 inst[3] = __CONST(1, RC_SWIZZLE_ZERO); 216 } 217 218 static void ei_math1(struct r300_vertex_program_code *vp, 219 unsigned int hw_opcode, 220 struct rc_sub_instruction *vpi, 221 unsigned int * inst) 222 { 223 inst[0] = PVS_OP_DST_OPERAND(hw_opcode, 224 1, 225 0, 226 t_dst_index(vp, &vpi->DstReg), 227 t_dst_mask(vpi->DstReg.WriteMask), 228 t_dst_class(vpi->DstReg.File)); 229 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 230 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 231 inst[3] = __CONST(0, RC_SWIZZLE_ZERO); 232 } 233 234 static void ei_lit(struct r300_vertex_program_code *vp, 235 struct rc_sub_instruction *vpi, 236 unsigned int * inst) 237 { 238 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W} 239 240 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX, 241 1, 242 0, 243 t_dst_index(vp, &vpi->DstReg), 244 t_dst_mask(vpi->DstReg.WriteMask), 245 t_dst_class(vpi->DstReg.File)); 246 /* NOTE: Users swizzling might not work. */ 247 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 248 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 249 PVS_SRC_SELECT_FORCE_0, // Z 250 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 251 t_src_class(vpi->SrcReg[0].File), 252 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 253 (vpi->SrcReg[0].RelAddr << 4); 254 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 255 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 256 PVS_SRC_SELECT_FORCE_0, // Z 257 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 258 t_src_class(vpi->SrcReg[0].File), 259 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 260 (vpi->SrcReg[0].RelAddr << 4); 261 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y 262 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X 263 PVS_SRC_SELECT_FORCE_0, // Z 264 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W 265 t_src_class(vpi->SrcReg[0].File), 266 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) | 267 (vpi->SrcReg[0].RelAddr << 4); 268 } 269 270 static void ei_mad(struct r300_vertex_program_code *vp, 271 struct rc_sub_instruction *vpi, 272 unsigned int * inst) 273 { 274 unsigned int i; 275 /* Remarks about hardware limitations of MAD 276 * (please preserve this comment, as this information is _NOT_ 277 * in the documentation provided by AMD). 278 * 279 * As described in the documentation, MAD with three unique temporary 280 * source registers requires the use of the macro version. 281 * 282 * However (and this is not mentioned in the documentation), apparently 283 * the macro version is _NOT_ a full superset of the normal version. 284 * In particular, the macro version does not always work when relative 285 * addressing is used in the source operands. 286 * 287 * This limitation caused incorrect rendering in Sauerbraten's OpenGL 288 * assembly shader path when using medium quality animations 289 * (i.e. animations with matrix blending instead of quaternion blending). 290 * 291 * Unfortunately, I (nha) have been unable to extract a Piglit regression 292 * test for this issue - for some reason, it is possible to have vertex 293 * programs whose prefix is *exactly* the same as the prefix of the 294 * offending program in Sauerbraten up to the offending instruction 295 * without causing any trouble. 296 * 297 * Bottom line: Only use the macro version only when really necessary; 298 * according to AMD docs, this should improve performance by one clock 299 * as a nice side bonus. 300 */ 301 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY && 302 vpi->SrcReg[1].File == RC_FILE_TEMPORARY && 303 vpi->SrcReg[2].File == RC_FILE_TEMPORARY && 304 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index && 305 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index && 306 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) { 307 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD, 308 0, 309 1, 310 t_dst_index(vp, &vpi->DstReg), 311 t_dst_mask(vpi->DstReg.WriteMask), 312 t_dst_class(vpi->DstReg.File)); 313 } else { 314 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD, 315 0, 316 0, 317 t_dst_index(vp, &vpi->DstReg), 318 t_dst_mask(vpi->DstReg.WriteMask), 319 t_dst_class(vpi->DstReg.File)); 320 321 /* Arguments with constant swizzles still count as a unique 322 * temporary, so we should make sure these arguments share a 323 * register index with one of the other arguments. */ 324 for (i = 0; i < 3; i++) { 325 unsigned int j; 326 if (vpi->SrcReg[i].File != RC_FILE_NONE) 327 continue; 328 329 for (j = 0; j < 3; j++) { 330 if (i != j) { 331 vpi->SrcReg[i].Index = 332 vpi->SrcReg[j].Index; 333 break; 334 } 335 } 336 } 337 } 338 inst[1] = t_src(vp, &vpi->SrcReg[0]); 339 inst[2] = t_src(vp, &vpi->SrcReg[1]); 340 inst[3] = t_src(vp, &vpi->SrcReg[2]); 341 } 342 343 static void ei_pow(struct r300_vertex_program_code *vp, 344 struct rc_sub_instruction *vpi, 345 unsigned int * inst) 346 { 347 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF, 348 1, 349 0, 350 t_dst_index(vp, &vpi->DstReg), 351 t_dst_mask(vpi->DstReg.WriteMask), 352 t_dst_class(vpi->DstReg.File)); 353 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]); 354 inst[2] = __CONST(0, RC_SWIZZLE_ZERO); 355 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]); 356 } 357 358 static void translate_vertex_program(struct radeon_compiler *c, void *user) 359 { 360 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 361 struct rc_instruction *rci; 362 363 unsigned loops[R500_PVS_MAX_LOOP_DEPTH]; 364 unsigned loop_depth = 0; 365 366 compiler->code->pos_end = 0; /* Not supported yet */ 367 compiler->code->length = 0; 368 compiler->code->num_temporaries = 0; 369 370 compiler->SetHwInputOutput(compiler); 371 372 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) { 373 struct rc_sub_instruction *vpi = &rci->U.I; 374 unsigned int *inst = compiler->code->body.d + compiler->code->length; 375 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode); 376 377 /* Skip instructions writing to non-existing destination */ 378 if (!valid_dst(compiler->code, &vpi->DstReg)) 379 continue; 380 381 if (info->HasDstReg) { 382 /* Neither is Saturate. */ 383 if (vpi->SaturateMode != RC_SATURATE_NONE) { 384 rc_error(&compiler->Base, "Vertex program does not support the Saturate " 385 "modifier (yet).\n"); 386 } 387 } 388 389 if (compiler->code->length >= c->max_alu_insts * 4) { 390 rc_error(&compiler->Base, "Vertex program has too many instructions\n"); 391 return; 392 } 393 394 assert(compiler->Base.is_r500 || 395 (vpi->Opcode != RC_OPCODE_SEQ && 396 vpi->Opcode != RC_OPCODE_SNE)); 397 398 switch (vpi->Opcode) { 399 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break; 400 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; 401 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; 402 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; 403 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; 404 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break; 405 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break; 406 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break; 407 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break; 408 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break; 409 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break; 410 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break; 411 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break; 412 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break; 413 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break; 414 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break; 415 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break; 416 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break; 417 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break; 418 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break; 419 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break; 420 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break; 421 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break; 422 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break; 423 case RC_OPCODE_BGNLOOP: 424 { 425 if ((!compiler->Base.is_r500 426 && loop_depth >= R300_VS_MAX_LOOP_DEPTH) 427 || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) { 428 rc_error(&compiler->Base, 429 "Loops are nested too deep."); 430 return; 431 } 432 loops[loop_depth++] = ((compiler->code->length)/ 4) + 1; 433 break; 434 } 435 case RC_OPCODE_ENDLOOP: 436 { 437 unsigned int act_addr; 438 unsigned int last_addr; 439 unsigned int ret_addr; 440 441 ret_addr = loops[--loop_depth]; 442 act_addr = ret_addr - 1; 443 last_addr = (compiler->code->length / 4) - 1; 444 445 if (loop_depth >= R300_VS_MAX_FC_OPS) { 446 rc_error(&compiler->Base, 447 "Too many flow control instructions."); 448 return; 449 } 450 if (compiler->Base.is_r500) { 451 compiler->code->fc_op_addrs.r500 452 [compiler->code->num_fc_ops].lw = 453 R500_PVS_FC_ACT_ADRS(act_addr) 454 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff) 455 ; 456 compiler->code->fc_op_addrs.r500 457 [compiler->code->num_fc_ops].uw = 458 R500_PVS_FC_LAST_INST(last_addr) 459 | R500_PVS_FC_RTN_INST(ret_addr) 460 ; 461 } else { 462 compiler->code->fc_op_addrs.r300 463 [compiler->code->num_fc_ops] = 464 R300_PVS_FC_ACT_ADRS(act_addr) 465 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff) 466 | R300_PVS_FC_LAST_INST(last_addr) 467 | R300_PVS_FC_RTN_INST(ret_addr) 468 ; 469 } 470 compiler->code->fc_loop_index[compiler->code->num_fc_ops] = 471 R300_PVS_FC_LOOP_INIT_VAL(0x0) 472 | R300_PVS_FC_LOOP_STEP_VAL(0x1) 473 ; 474 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP( 475 compiler->code->num_fc_ops); 476 compiler->code->num_fc_ops++; 477 478 break; 479 } 480 481 case RC_ME_PRED_SET_CLR: 482 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst); 483 break; 484 485 case RC_ME_PRED_SET_INV: 486 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst); 487 break; 488 489 case RC_ME_PRED_SET_POP: 490 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst); 491 break; 492 493 case RC_ME_PRED_SET_RESTORE: 494 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst); 495 break; 496 497 case RC_ME_PRED_SEQ: 498 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst); 499 break; 500 501 case RC_ME_PRED_SNEQ: 502 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst); 503 break; 504 505 case RC_VE_PRED_SNEQ_PUSH: 506 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH, 507 vpi, inst); 508 break; 509 510 default: 511 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name); 512 return; 513 } 514 515 if (vpi->DstReg.Pred != RC_PRED_DISABLED) { 516 inst[0] |= (PVS_DST_PRED_ENABLE_MASK 517 << PVS_DST_PRED_ENABLE_SHIFT); 518 if (vpi->DstReg.Pred == RC_PRED_SET) { 519 inst[0] |= (PVS_DST_PRED_SENSE_MASK 520 << PVS_DST_PRED_SENSE_SHIFT); 521 } 522 } 523 524 /* Update the number of temporaries. */ 525 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY && 526 vpi->DstReg.Index >= compiler->code->num_temporaries) 527 compiler->code->num_temporaries = vpi->DstReg.Index + 1; 528 529 for (unsigned i = 0; i < info->NumSrcRegs; i++) 530 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY && 531 vpi->SrcReg[i].Index >= compiler->code->num_temporaries) 532 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; 533 534 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { 535 rc_error(&compiler->Base, "Too many temporaries.\n"); 536 return; 537 } 538 539 compiler->code->length += 4; 540 541 if (compiler->Base.Error) 542 return; 543 } 544 } 545 546 struct temporary_allocation { 547 unsigned int Allocated:1; 548 unsigned int HwTemp:15; 549 struct rc_instruction * LastRead; 550 }; 551 552 static void allocate_temporary_registers(struct radeon_compiler *c, void *user) 553 { 554 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; 555 struct rc_instruction *inst; 556 struct rc_instruction *end_loop = NULL; 557 unsigned int num_orig_temps = 0; 558 char hwtemps[RC_REGISTER_MAX_INDEX]; 559 struct temporary_allocation * ta; 560 unsigned int i, j; 561 562 memset(hwtemps, 0, sizeof(hwtemps)); 563 564 rc_recompute_ips(c); 565 566 /* Pass 1: Count original temporaries. */ 567 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 568 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 569 570 for (i = 0; i < opcode->NumSrcRegs; ++i) { 571 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 572 if (inst->U.I.SrcReg[i].Index >= num_orig_temps) 573 num_orig_temps = inst->U.I.SrcReg[i].Index + 1; 574 } 575 } 576 577 if (opcode->HasDstReg) { 578 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 579 if (inst->U.I.DstReg.Index >= num_orig_temps) 580 num_orig_temps = inst->U.I.DstReg.Index + 1; 581 } 582 } 583 } 584 585 ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool, 586 sizeof(struct temporary_allocation) * num_orig_temps); 587 memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps); 588 589 /* Pass 2: Determine original temporary lifetimes */ 590 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 591 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 592 /* Instructions inside of loops need to use the ENDLOOP 593 * instruction as their LastRead. */ 594 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) { 595 int endloops = 1; 596 struct rc_instruction * ptr; 597 for(ptr = inst->Next; 598 ptr != &compiler->Base.Program.Instructions; 599 ptr = ptr->Next){ 600 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) { 601 endloops++; 602 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) { 603 endloops--; 604 if (endloops <= 0) { 605 end_loop = ptr; 606 break; 607 } 608 } 609 } 610 } 611 612 if (inst == end_loop) { 613 end_loop = NULL; 614 continue; 615 } 616 617 for (i = 0; i < opcode->NumSrcRegs; ++i) { 618 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 619 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst; 620 } 621 } 622 } 623 624 /* Pass 3: Register allocation */ 625 for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { 626 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 627 628 for (i = 0; i < opcode->NumSrcRegs; ++i) { 629 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { 630 unsigned int orig = inst->U.I.SrcReg[i].Index; 631 inst->U.I.SrcReg[i].Index = ta[orig].HwTemp; 632 633 if (ta[orig].Allocated && inst == ta[orig].LastRead) 634 hwtemps[ta[orig].HwTemp] = 0; 635 } 636 } 637 638 if (opcode->HasDstReg) { 639 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { 640 unsigned int orig = inst->U.I.DstReg.Index; 641 642 if (!ta[orig].Allocated) { 643 for(j = 0; j < c->max_temp_regs; ++j) { 644 if (!hwtemps[j]) 645 break; 646 } 647 ta[orig].Allocated = 1; 648 ta[orig].HwTemp = j; 649 hwtemps[ta[orig].HwTemp] = 1; 650 } 651 652 inst->U.I.DstReg.Index = ta[orig].HwTemp; 653 } 654 } 655 } 656 } 657 658 /** 659 * R3xx-R4xx vertex engine does not support the Absolute source operand modifier 660 * and the Saturate opcode modifier. Only Absolute is currently transformed. 661 */ 662 static int transform_nonnative_modifiers( 663 struct radeon_compiler *c, 664 struct rc_instruction *inst, 665 void* unused) 666 { 667 const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode); 668 unsigned i; 669 670 /* Transform ABS(a) to MAX(a, -a). */ 671 for (i = 0; i < opcode->NumSrcRegs; i++) { 672 if (inst->U.I.SrcReg[i].Abs) { 673 struct rc_instruction *new_inst; 674 unsigned temp; 675 676 inst->U.I.SrcReg[i].Abs = 0; 677 678 temp = rc_find_free_temporary(c); 679 680 new_inst = rc_insert_new_instruction(c, inst->Prev); 681 new_inst->U.I.Opcode = RC_OPCODE_MAX; 682 new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY; 683 new_inst->U.I.DstReg.Index = temp; 684 new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i]; 685 new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i]; 686 new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW; 687 688 memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i])); 689 inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY; 690 inst->U.I.SrcReg[i].Index = temp; 691 inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW; 692 } 693 } 694 return 1; 695 } 696 697 /** 698 * Vertex engine cannot read two inputs or two constants at the same time. 699 * Introduce intermediate MOVs to temporary registers to account for this. 700 */ 701 static int transform_source_conflicts( 702 struct radeon_compiler *c, 703 struct rc_instruction* inst, 704 void* unused) 705 { 706 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 707 708 if (opcode->NumSrcRegs == 3) { 709 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]) 710 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) { 711 int tmpreg = rc_find_free_temporary(c); 712 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 713 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 714 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 715 inst_mov->U.I.DstReg.Index = tmpreg; 716 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2]; 717 718 reset_srcreg(&inst->U.I.SrcReg[2]); 719 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY; 720 inst->U.I.SrcReg[2].Index = tmpreg; 721 } 722 } 723 724 if (opcode->NumSrcRegs >= 2) { 725 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) { 726 int tmpreg = rc_find_free_temporary(c); 727 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev); 728 inst_mov->U.I.Opcode = RC_OPCODE_MOV; 729 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY; 730 inst_mov->U.I.DstReg.Index = tmpreg; 731 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1]; 732 733 reset_srcreg(&inst->U.I.SrcReg[1]); 734 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY; 735 inst->U.I.SrcReg[1].Index = tmpreg; 736 } 737 } 738 739 return 1; 740 } 741 742 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user) 743 { 744 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c; 745 int i; 746 747 for(i = 0; i < 32; ++i) { 748 if ((compiler->RequiredOutputs & (1 << i)) && 749 !(compiler->Base.Program.OutputsWritten & (1 << i))) { 750 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev); 751 inst->U.I.Opcode = RC_OPCODE_MOV; 752 753 inst->U.I.DstReg.File = RC_FILE_OUTPUT; 754 inst->U.I.DstReg.Index = i; 755 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW; 756 757 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT; 758 inst->U.I.SrcReg[0].Index = 0; 759 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW; 760 761 compiler->Base.Program.OutputsWritten |= 1 << i; 762 } 763 } 764 } 765 766 static void dataflow_outputs_mark_used(void * userdata, void * data, 767 void (*callback)(void *, unsigned int, unsigned int)) 768 { 769 struct r300_vertex_program_compiler * c = userdata; 770 int i; 771 772 for(i = 0; i < 32; ++i) { 773 if (c->RequiredOutputs & (1 << i)) 774 callback(data, i, RC_MASK_XYZW); 775 } 776 } 777 778 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg) 779 { 780 (void) opcode; 781 (void) reg; 782 783 return 1; 784 } 785 786 static void transform_negative_addressing(struct r300_vertex_program_compiler *c, 787 struct rc_instruction *arl, 788 struct rc_instruction *end, 789 int min_offset) 790 { 791 struct rc_instruction *inst, *add; 792 unsigned const_swizzle; 793 794 /* Transform ARL */ 795 add = rc_insert_new_instruction(&c->Base, arl->Prev); 796 add->U.I.Opcode = RC_OPCODE_ADD; 797 add->U.I.DstReg.File = RC_FILE_TEMPORARY; 798 add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base); 799 add->U.I.DstReg.WriteMask = RC_MASK_X; 800 add->U.I.SrcReg[0] = arl->U.I.SrcReg[0]; 801 add->U.I.SrcReg[1].File = RC_FILE_CONSTANT; 802 add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants, 803 min_offset, &const_swizzle); 804 add->U.I.SrcReg[1].Swizzle = const_swizzle; 805 806 arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY; 807 arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index; 808 arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX; 809 810 /* Rewrite offsets up to and excluding inst. */ 811 for (inst = arl->Next; inst != end; inst = inst->Next) { 812 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 813 814 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) 815 if (inst->U.I.SrcReg[i].RelAddr) 816 inst->U.I.SrcReg[i].Index -= min_offset; 817 } 818 } 819 820 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user) 821 { 822 struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler; 823 struct rc_instruction *inst, *lastARL = NULL; 824 int min_offset = 0; 825 826 for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) { 827 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); 828 829 if (inst->U.I.Opcode == RC_OPCODE_ARL) { 830 if (lastARL != NULL && min_offset < 0) 831 transform_negative_addressing(c, lastARL, inst, min_offset); 832 833 lastARL = inst; 834 min_offset = 0; 835 continue; 836 } 837 838 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) { 839 if (inst->U.I.SrcReg[i].RelAddr && 840 inst->U.I.SrcReg[i].Index < 0) { 841 /* ARL must precede any indirect addressing. */ 842 if (lastARL == NULL) { 843 rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL."); 844 return; 845 } 846 847 if (inst->U.I.SrcReg[i].Index < min_offset) 848 min_offset = inst->U.I.SrcReg[i].Index; 849 } 850 } 851 } 852 853 if (lastARL != NULL && min_offset < 0) 854 transform_negative_addressing(c, lastARL, inst, min_offset); 855 } 856 857 static struct rc_swizzle_caps r300_vertprog_swizzle_caps = { 858 .IsNative = &swizzle_is_native, 859 .Split = 0 /* should never be called */ 860 }; 861 862 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) 863 { 864 int is_r500 = c->Base.is_r500; 865 int opt = !c->Base.disable_optimizations; 866 867 /* Lists of instruction transformations. */ 868 struct radeon_program_transformation alu_rewrite_r500[] = { 869 { &r300_transform_vertex_alu, 0 }, 870 { &r300_transform_trig_scale_vertex, 0 }, 871 { 0, 0 } 872 }; 873 874 struct radeon_program_transformation alu_rewrite_r300[] = { 875 { &r300_transform_vertex_alu, 0 }, 876 { &r300_transform_trig_simple, 0 }, 877 { 0, 0 } 878 }; 879 880 /* Note: These passes have to be done seperately from ALU rewrite, 881 * otherwise non-native ALU instructions with source conflits 882 * or non-native modifiers will not be treated properly. 883 */ 884 struct radeon_program_transformation emulate_modifiers[] = { 885 { &transform_nonnative_modifiers, 0 }, 886 { 0, 0 } 887 }; 888 889 struct radeon_program_transformation resolve_src_conflicts[] = { 890 { &transform_source_conflicts, 0 }, 891 { 0, 0 } 892 }; 893 894 /* List of compiler passes. */ 895 struct radeon_compiler_pass vs_list[] = { 896 /* NAME DUMP PREDICATE FUNCTION PARAM */ 897 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL}, 898 {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL}, 899 {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL}, 900 {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500}, 901 {"native rewrite", 1, !is_r500, rc_local_transform, alu_rewrite_r300}, 902 {"emulate modifiers", 1, !is_r500, rc_local_transform, emulate_modifiers}, 903 {"deadcode", 1, opt, rc_dataflow_deadcode, dataflow_outputs_mark_used}, 904 {"dataflow optimize", 1, opt, rc_optimize, NULL}, 905 /* This pass must be done after optimizations. */ 906 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts}, 907 {"register allocation", 1, opt, allocate_temporary_registers, NULL}, 908 {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table}, 909 {"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL}, 910 {"final code validation", 0, 1, rc_validate_final_shader, NULL}, 911 {"machine code generation", 0, 1, translate_vertex_program, NULL}, 912 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL}, 913 {NULL, 0, 0, NULL, NULL} 914 }; 915 916 c->Base.type = RC_VERTEX_PROGRAM; 917 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps; 918 919 rc_run_compiler(&c->Base, vs_list); 920 921 c->code->InputsRead = c->Base.Program.InputsRead; 922 c->code->OutputsWritten = c->Base.Program.OutputsWritten; 923 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants); 924 } 925