/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_target.h"
#include "codegen/nv50_ir_build_util.h"

extern "C" {
#include "util/u_math.h"
}

namespace nv50_ir {

bool
Instruction::isNop() const
{
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (op == OP_ATOM)
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}

bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT ||
       op == OP_ATOM ||
       op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
       op == OP_WRSV)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
}

// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
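// Illustrative example (pseudo-IR): given
//   %b = mov %a
//   %c = add %b, %d
// the ADD is rewritten to use %a directly; the MOV then has no users left
// and is deleted on the spot.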
bool
CopyPropagation::visit(BasicBlock *bb)
{
   Instruction *mov, *si, *next;

   for (mov = bb->getEntry(); mov; mov = next) {
      next = mov->next;
      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
         continue;
      if (mov->getPredicate())
         continue;
      if (mov->def(0).getFile() != mov->src(0).getFile())
         continue;
      si = mov->getSrc(0)->getInsn();
      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
         // propagate
         mov->def(0).replace(mov->getSrc(0), false);
         delete_Instruction(prog, mov);
      }
   }
   return true;
}

// =============================================================================

class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// For SPLIT / MERGE pairs that operate on the same registers, replace the
// post-merge def with the SPLIT's source.
bool
MergeSplits::visit(BasicBlock *bb)
{
   Instruction *i, *next, *si;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
         continue;
      si = i->getSrc(0)->getInsn();
      if (!si || si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
         continue;
      i->def(0).replace(si->getSrc(0), false);
      delete_Instruction(prog, i);
   }

   return true;
}

// =============================================================================

class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};

bool
LoadPropagation::isCSpaceLoad(Instruction *ld)
{
   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
}

bool
LoadPropagation::isImmdLoad(Instruction *ld)
{
   if (!ld || (ld->op != OP_MOV) ||
       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
      return false;

   // A 0 can be replaced with a register, so it doesn't count as an immediate.
   ImmediateValue val;
   return ld->src(0).getImmediate(val) && !val.isInteger(0);
}

bool
LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
{
   return ld &&
      (ld->op == OP_VFETCH ||
       (ld->op == OP_LOAD &&
        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
}

void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   const Target *targ = prog->getTarget();
   if (!targ->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
         return;
   if (insn->src(1).getFile() != FILE_GPR)
      return;
   // This is the special OP_SET used for alphatesting, we can't reverse its
   // arguments as that will confuse the fixup code.
   if (insn->op == OP_SET && insn->subOp)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Swap sources to inline the less frequently used source. That way,
   // optimistically, it will eventually be able to remove the instruction.
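   // e.g. if src0 is a single-use constant-buffer load and src1 a plain GPR,
   // moving the load into source slot 1 lets insnCanLoad() fold it into the
   // instruction, after which the now-unused load can be deleted.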
   int i0refs = insn->getSrc(0)->refCount();
   int i1refs = insn->getSrc(1)->refCount();

   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
          !targ->insnCanLoad(insn, 1, i1) ||
          i0refs < i1refs)
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SUB) {
      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
   }
}

bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
         continue;

      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
         continue;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}

// =============================================================================

class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
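// Fold a constant address computation into the indirect access itself:
// e.g. (illustrative) "ld $r0 c0[$a0 + 0x4]" with "$a0 = add $a1, 0x10"
// becomes "ld $r0 c0[$a1 + 0x14]", provided insnCanLoadOffset() accepts
// the combined offset. A MOV of an immediate drops the indirect source
// entirely.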
bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}

// =============================================================================

// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
   void opnd(Instruction *, ImmediateValue&, int s);
   void opnd3(Instruction *, ImmediateValue&);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   CmpInstruction *findOriginForTestWithZero(Value *);

   unsigned int foldCount;

   BuildUtil bld;
};

// TODO: remember generated immediates and only revisit these
bool
ConstantFolding::foldAll(Program *prog)
{
   unsigned int iterCount = 0;
   do {
      foldCount = 0;
      if (!run(prog))
         return false;
   } while (foldCount && ++iterCount < 2);
   return true;
}

bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2))
         expr(i, src0, src1, src2);
      else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
      if (i->srcExists(2) && i->src(2).getImmediate(src2))
         opnd3(i, src2);
   }
   return true;
}

CmpInstruction *
ConstantFolding::findOriginForTestWithZero(Value *value)
{
   if (!value)
      return NULL;
   Instruction *insn = value->getInsn();
   if (!insn)
      return NULL;

   if (insn->asCmp() && insn->op != OP_SLCT)
      return insn->asCmp();

   /* Sometimes mov's will sneak in as a result of other folding. This gets
    * cleaned up later.
    */
   if (insn->op == OP_MOV)
      return findOriginForTestWithZero(insn->getSrc(0));

   /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
   if (insn->op == OP_AND) {
      int s = 0;
      ImmediateValue imm;
      if (!insn->src(s).getImmediate(imm)) {
         s = 1;
         if (!insn->src(s).getImmediate(imm))
            return NULL;
      }
      if (imm.reg.data.f32 != 1.0f)
         return NULL;
      /* TODO: Come up with a way to handle the condition being inverted */
      if (insn->src(!s).mod != Modifier(0))
         return NULL;
      return findOriginForTestWithZero(insn->getSrc(!s));
   }

   return NULL;
}
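// e.g. "and $r1, $r0, 1.0" where $r0 comes from a float SET is traced back
// to that SET, as is any intervening MOV left behind by earlier folding.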
void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}
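// Note the fixed application order above: e.g. (NEG | ABS) applied to the
// f32 immediate -3.0f first takes the absolute value (3.0f) and then
// negates it, yielding -3.0f.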
operation
Modifier::getOp() const
{
   switch (bits) {
   case NV50_IR_MOD_ABS: return OP_ABS;
   case NV50_IR_MOD_NEG: return OP_NEG;
   case NV50_IR_MOD_SAT: return OP_SAT;
   case NV50_IR_MOD_NOT: return OP_NOT;
   case 0:
      return OP_MOV;
   default:
      return OP_CVT;
   }
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SUB:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   case OP_EXTBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      if (width + offset < 32) {
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
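   // e.g. for b == 0x0805 (width 8, offset 5) and a == 0xabcdef12 (U32):
   // lshift == 19, rshift == 24, so the result is
   // (0xabcdef12 << 19) >> 24 == 0x78 == (a >> 5) & 0xff.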
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      type = TYPE_U32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();

      // Move the immediate into position 1, where we know it might be
      // emittable. However it might not be anyways, as there may be other
      // restrictions, so move it into a separate LValue.
      bld.setPosition(i, false);
      i->op = OP_ADD;
      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      if (i->src(0).getImmediate(src0))
         expr(i, src0, src1);
      else
         opnd(i, src1, 1);
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
      break;
   }
   i->subOp = 0;
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0,
                      ImmediateValue &imm1,
                      ImmediateValue &imm2)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_INSBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      unsigned bitmask = ((1 << width) - 1) << offset;
      res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
      break;
   }
   case OP_MAD:
   case OP_FMA: {
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
            c->data.f32;
         break;
      case TYPE_F64:
         res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
         break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
         break;
      default:
         return;
      }
      break;
   }
   case OP_SHLADD:
      res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
      break;
   default:
      return;
   }

   ++foldCount;
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->src(2).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);
   i->setSrc(2, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = i->dType;
   i->getSrc(0)->reg.size = typeSizeof(i->dType);

   i->op = OP_MOV;
}
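// e.g. INSBF with b == 0x0804 (width 8, offset 4) gives bitmask == 0xff0,
// so the low byte of src0 lands in bits [4,11] of src2; the whole
// instruction then folds into a MOV of the combined immediate.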
void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}
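// e.g. "rcp f32 $r0, 0.25" folds to "mov $r0, 4.0" here; PRESIN/PREEX2
// just pass the immediate through, and the SIN/COS/EX2 consuming them is
// folded the same way once its source has become an immediate.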
void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (insn && !mul2->src(t).mod && insn->op == OP_MUL &&
          insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}
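// e.g. "b = mul f32 a, 4.0; d = mul f32 b, 0.5" collapses to
// "d = mul f32 a, 2.0" when b has no other uses.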
void
ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
{
   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
      if (imm2.isInteger(0)) {
         i->op = OP_MUL;
         i->setSrc(2, NULL);
         foldCount++;
         return;
      }
      break;
   case OP_SHLADD:
      if (imm2.isInteger(0)) {
         i->op = OP_SHL;
         i->setSrc(2, NULL);
         foldCount++;
         return;
      }
      break;
   default:
      return;
   }
}

void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const Target *target = prog->getTarget();
   const int t = !s;
   const operation op = i->op;
   Instruction *newi = i;

   switch (i->op) {
   case OP_SPLIT: {
      bld.setPosition(i, false);

      uint8_t size = i->getDef(0)->reg.size;
      uint8_t bitsize = size * 8; // reg.size is in bytes, shifts need bits
      uint32_t mask = (1ULL << bitsize) - 1;
      assert(bitsize <= 32);

      uint64_t val = imm0.reg.data.u64;
      for (int8_t d = 0; i->defExists(d); ++d) {
         Value *def = i->getDef(d);
         assert(def->reg.size == size);

         newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
         val >>= bitsize;
      }
      delete_Instruction(prog, i);
      break;
   }
   case OP_MUL:
      if (i->dType == TYPE_F32)
         tryCollapseChainedMULs(i, s, imm0);

      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
         assert(!isFloatType(i->sType));
         if (imm0.isInteger(1) && i->dType == TYPE_S32) {
            bld.setPosition(i, false);
            // Need to set to the sign value, which is a compare.
            newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
                             TYPE_S32, i->getSrc(t), bld.mkImm(0));
            delete_Instruction(prog, i);
         } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
            // The high bits can't be set in this case (either mul by 0 or
            // unsigned by 1)
            i->op = OP_MOV;
            i->subOp = 0;
            i->setSrc(0, new_ImmediateValue(prog, 0u));
            i->src(0).mod = Modifier(0);
            i->setSrc(1, NULL);
         } else if (!imm0.isNegative() && imm0.isPow2()) {
            // Translate into a shift
            imm0.applyLog2();
            i->op = OP_SHR;
            i->subOp = 0;
            imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
            i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
            i->src(1).mod = 0;
         }
      } else
      if (imm0.isInteger(0)) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->postFactor = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
         i->op = OP_SHL;
         imm0.applyLog2();
         i->setSrc(0, i->getSrc(t));
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
      } else
      if (i->postFactor && i->sType == TYPE_F32) {
         /* Can't emit a postfactor with an immediate, have to fold it in */
         i->setSrc(s, new_ImmediateValue(
                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
         i->postFactor = 0;
      }
      break;
   case OP_MAD:
      if (imm0.isInteger(0)) {
         i->setSrc(0, i->getSrc(2));
         i->src(0).mod = i->src(2).mod;
         i->setSrc(1, NULL);
         i->setSrc(2, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
      } else
      if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
          (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, i->getSrc(2));
         i->src(1).mod = i->src(2).mod;
         i->setSrc(2, NULL);
         i->op = OP_ADD;
      } else
      if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
          target->isOpSupported(OP_SHLADD, i->dType)) {
         i->op = OP_SHLADD;
         imm0.applyLog2();
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
      }
      break;
   case OP_ADD:
   case OP_SUB:
      if (i->usesFlags())
         break;
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            if (i->op == OP_SUB)
               i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
      if (i->dType == TYPE_U32) {
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s;
         uint32_t l = util_logbase2(d);
         if (((uint32_t)1 << l) < d)
            ++l;
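         // e.g. for d == 7: l == 3, m == 0x24924925, r == 1, s == 2; the
         // quotient n / 7 is then computed as
         //   hi = mulhi(n, m);  q = (hi + ((n - hi) >> 1)) >> 2.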
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
      } else
      if (imm0.reg.data.s32 == -1) {
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);

         delete_Instruction(prog, i);
      }
      break;

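   // e.g. "mod u32 $r0, 16" becomes "and $r0, 15" below.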
   case OP_MOD:
      if (i->sType == TYPE_U32 && imm0.isPow2()) {
         bld.setPosition(i, false);
         i->op = OP_AND;
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      /* This optimizes the case where the output of a set is being compared
       * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
       * can be a lot cleverer in our comparison.
       */
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (imm0.reg.data.u32 != 0 || !si)
         return;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      // We do everything assuming var (cmp) 0, reverse the condition if 0 is
      // first.
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      // If there is a negative modifier, we need to undo that, by flipping
      // the comparison to zero.
      if (i->src(t).mod.neg())
         ccZ = reverseCondCode(ccZ);
      // If this is a signed comparison, we expect the input to be a regular
      // boolean, i.e. 0/-1. However the rest of the logic assumes that true
      // is positive, so just flip the sign.
      if (i->sType == TYPE_S32) {
         assert(!isFloatType(si->dType));
         ccZ = reverseCondCode(ccZ);
      }
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
      case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
      case CC_GT: break; // bool > 0 -- bool
      case CC_NE: break; // bool != 0 -- bool
      default:
         return;
      }

      // Update the condition of this SET to be identical to the origin set,
      // but with the updated condition code. The original SET should get
      // DCE'd, ideally.
      i->op = si->op;
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      if (si->srcExists(2))
         i->setSrc(2, si->src(2));
      i->sType = si->sType;
   }
   break;

   case OP_AND:
   {
      Instruction *src = i->getSrc(t)->getInsn();
      ImmediateValue imm1;
      if (imm0.reg.data.u32 == 0) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else if (imm0.reg.data.u32 == ~0U) {
         i->op = i->src(t).mod.getOp();
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
         }
         i->setSrc(1, NULL);
      } else if (src && src->asCmp()) {
         CmpInstruction *cmp = src->asCmp();
         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
            return;
         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
            return;
         if (imm0.reg.data.f32 != 1.0)
            return;
         if (cmp->dType != TYPE_U32)
            return;

         cmp->dType = TYPE_F32;
         if (i->src(t).mod != Modifier(0)) {
            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
            i->src(t).mod = Modifier(0);
            cmp->setCond = inverseCondCode(cmp->setCond);
         }
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      } else if (src &&
                 prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
                 src->op == OP_SHR &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two(imm0.reg.data.u32 + 1)) {
         // low byte = offset, high byte = width
         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
         i->op = OP_EXTBF;
         i->setSrc(0, src->getSrc(0));
         i->setSrc(1, new_ImmediateValue(prog, ext));
      } else if (src && src->op == OP_SHL &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two(~imm0.reg.data.u32 + 1) &&
                 util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      }
   }
   break;

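   // e.g. "and $r0, (shr $r1, 8), 0xff" was turned into "extbf $r1, 0x0808"
   // (width 8, offset 8) by the OP_AND case above, if the target has EXTBF.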
   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si)
         break;
      ImmediateValue imm1;
      switch (si->op) {
      case OP_SHL:
         if (si->src(1).getImmediate(imm1)) {
            bld.setPosition(i, false);
            i->setSrc(0, si->getSrc(0));
            i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
         }
         break;
      case OP_SHR:
         if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
            bld.setPosition(i, false);
            i->op = OP_AND;
            i->setSrc(0, si->getSrc(0));
            i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
         }
         break;
      case OP_MUL:
         int muls;
         if (isFloatType(si->dType))
            return;
         if (si->src(1).getImmediate(imm1))
            muls = 1;
         else if (si->src(0).getImmediate(imm1))
            muls = 0;
         else
            return;

         bld.setPosition(i, false);
         i->op = OP_MUL;
         i->setSrc(0, si->getSrc(!muls));
         i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         break;
      case OP_SUB:
      case OP_ADD:
         int adds;
         if (isFloatType(si->dType))
            return;
         if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
            adds = 0;
         else if (si->src(1).getImmediate(imm1))
            adds = 1;
         else
            return;
         if (si->src(!adds).mod != Modifier(0))
            return;
         // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))

         // This is more operations, but if one of x, y is an immediate, then
         // we can get a situation where (a) we can use ISCADD, or (b)
         // propagate the add bit into an indirect load.
         bld.setPosition(i, false);
         i->op = si->op;
         i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
                                     bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
                                     si->getSrc(!adds),
                                     bld.mkImm(imm0.reg.data.u32)));
         break;
      default:
         return;
      }
   }
   break;

   case OP_ABS:
   case OP_NEG:
   case OP_SAT:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   case OP_BFIND: {
      int32_t res;
      switch (i->dType) {
      case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
      case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
      default:
         return;
      }
      if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
         res = 31 - res;
      bld.setPosition(i, false); /* make sure bld is init'ed */
      i->setSrc(0, bld.mkImm(res));
      i->setSrc(1, NULL);
      i->op = OP_MOV;
      i->subOp = 0;
      break;
   }
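   // e.g. bfind u32 of 0x00010000 folds to a mov of 16 (util_last_bit() - 1);
   // with the SAMT subop the bit is counted from the top: 31 - 16 == 15.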
   case OP_POPCNT: {
      // Only deal with 1-arg POPCNT here
      if (i->srcExists(1))
         break;
      uint32_t res = util_bitcount(imm0.reg.data.u32);
      i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
      i->setSrc(1, NULL);
      i->op = OP_MOV;
      break;
   }
   case OP_CVT: {
      Storage res;

      // TODO: handle 64-bit values properly
      if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
         return;

      // TODO: handle single byte/word extractions
      if (i->subOp)
         return;

      bld.setPosition(i, true); /* make sure bld is init'ed */

#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
   case type: \
      switch (i->sType) { \
      case TYPE_F64: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f64, fmin, fmax) : \
                                    imm0.reg.data.f64); \
         break; \
      case TYPE_F32: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f32, fmin, fmax) : \
                                    imm0.reg.data.f32); \
         break; \
      case TYPE_S32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s32, imin, imax) : \
                        imm0.reg.data.s32; \
         break; \
      case TYPE_U32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u32, umin, umax) : \
                        imm0.reg.data.u32; \
         break; \
      case TYPE_S16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s16, imin, imax) : \
                        imm0.reg.data.s16; \
         break; \
      case TYPE_U16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u16, umin, umax) : \
                        imm0.reg.data.u16; \
         break; \
      default: return; \
      } \
      i->setSrc(0, bld.mkImm(res.data.dst)); \
      break

      switch(i->dType) {
      CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
      CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
      CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
      CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
      case TYPE_F32:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f32 = i->saturate ?
               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f32 = i->saturate ?
               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
         default:
            return;
         }
         i->setSrc(0, bld.mkImm(res.data.f32));
         break;
      case TYPE_F64:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f64 = i->saturate ?
               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f64 = i->saturate ?
               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
         default:
            return;
         }
         i->setSrc(0, bld.mkImm(res.data.f64));
         break;
      default:
         return;
      }
#undef CASE

      i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
      i->op = OP_MOV;
      i->saturate = 0;
      i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
      break;
   }
   default:
      return;
   }
   if (newi->op != op)
      foldCount++;
}
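// The CVT folding above happens entirely at compile time, e.g. a saturated
// f32 -> s16 conversion of the immediate 70000.0f becomes a mov of 32767.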
// =============================================================================

// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}
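// e.g. with "%n = neg f32 %x", a use "add f32 %d, %y, %n" becomes
// "add f32 %d, %y, -%x" when the target supports a NEG modifier there.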
// =============================================================================

// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT_NEG(Instruction *);
   void handleCVT_CVT(Instruction *);
   void handleCVT_EXTBF(Instruction *);
   void handleSUCLAMP(Instruction *);
   void handleNEG(Instruction *);

   BuildUtil bld;
};

void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // expect not to have mods yet, if we do, bail
   if (sub->src(0).mod || sub->src(1).mod)
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
      return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB)
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}
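// e.g. "abs s32 (sub s32 a, b)" becomes "sad s32 a, b, 0", i.e.
// |a - b| + 0, using the hardware sum-of-absolute-differences instruction.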
0 : 1)); 1737 1738 add->setSrc(0, src->getInsn()->getSrc(0)); 1739 add->src(0).mod = mod[2] ^ mod[s]; 1740 add->setSrc(1, src->getInsn()->getSrc(1)); 1741 add->src(1).mod = mod[3]; 1742 1743 return true; 1744 } 1745 1746 void 1747 AlgebraicOpt::handleMINMAX(Instruction *minmax) 1748 { 1749 Value *src0 = minmax->getSrc(0); 1750 Value *src1 = minmax->getSrc(1); 1751 1752 if (src0 != src1 || src0->reg.file != FILE_GPR) 1753 return; 1754 if (minmax->src(0).mod == minmax->src(1).mod) { 1755 if (minmax->def(0).mayReplace(minmax->src(0))) { 1756 minmax->def(0).replace(minmax->src(0), false); 1757 minmax->bb->remove(minmax); 1758 } else { 1759 minmax->op = OP_CVT; 1760 minmax->setSrc(1, NULL); 1761 } 1762 } else { 1763 // TODO: 1764 // min(x, -x) = -abs(x) 1765 // min(x, -abs(x)) = -abs(x) 1766 // min(x, abs(x)) = x 1767 // max(x, -abs(x)) = x 1768 // max(x, abs(x)) = abs(x) 1769 // max(x, -x) = abs(x) 1770 } 1771 } 1772 1773 void 1774 AlgebraicOpt::handleRCP(Instruction *rcp) 1775 { 1776 Instruction *si = rcp->getSrc(0)->getUniqueInsn(); 1777 1778 if (si && si->op == OP_RCP) { 1779 Modifier mod = rcp->src(0).mod * si->src(0).mod; 1780 rcp->op = mod.getOp(); 1781 rcp->setSrc(0, si->getSrc(0)); 1782 } 1783 } 1784 1785 void 1786 AlgebraicOpt::handleSLCT(Instruction *slct) 1787 { 1788 if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) { 1789 if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f)) 1790 slct->setSrc(0, slct->getSrc(1)); 1791 } else 1792 if (slct->getSrc(0) != slct->getSrc(1)) { 1793 return; 1794 } 1795 slct->op = OP_MOV; 1796 slct->setSrc(1, NULL); 1797 slct->setSrc(2, NULL); 1798 } 1799 1800 void 1801 AlgebraicOpt::handleLOGOP(Instruction *logop) 1802 { 1803 Value *src0 = logop->getSrc(0); 1804 Value *src1 = logop->getSrc(1); 1805 1806 if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR) 1807 return; 1808 1809 if (src0 == src1) { 1810 if ((logop->op == OP_AND || logop->op == OP_OR) && 1811 logop->def(0).mayReplace(logop->src(0))) { 1812 logop->def(0).replace(logop->src(0), false); 1813 delete_Instruction(prog, logop); 1814 } 1815 } else { 1816 // try AND(SET, SET) -> SET_AND(SET) 1817 Instruction *set0 = src0->getInsn(); 1818 Instruction *set1 = src1->getInsn(); 1819 1820 if (!set0 || set0->fixed || !set1 || set1->fixed) 1821 return; 1822 if (set1->op != OP_SET) { 1823 Instruction *xchg = set0; 1824 set0 = set1; 1825 set1 = xchg; 1826 if (set1->op != OP_SET) 1827 return; 1828 } 1829 operation redOp = (logop->op == OP_AND ? OP_SET_AND : 1830 logop->op == OP_XOR ? 
void
AlgebraicOpt::handleSLCT(Instruction *slct)
{
   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
         slct->setSrc(0, slct->getSrc(1));
   } else
   if (slct->getSrc(0) != slct->getSrc(1)) {
      return;
   }
   slct->op = OP_MOV;
   slct->setSrc(1, NULL);
   slct->setSrc(2, NULL);
}

void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}

// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
// F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}

// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
// does a type conversion, this becomes trickier as there might be range
// changes/etc. We could handle those in theory as long as the range was being
// reduced or kept the same.
void
AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   RoundMode rnd;

   if (!insn)
      return;
   rnd = insn->rnd;

   if (insn->saturate ||
       insn->subOp ||
       insn->dType != insn->sType ||
       insn->dType != cvt->sType)
      return;

   switch (insn->op) {
   case OP_CEIL:
      rnd = ROUND_PI;
      break;
   case OP_FLOOR:
      rnd = ROUND_MI;
      break;
   case OP_TRUNC:
      rnd = ROUND_ZI;
      break;
   case OP_CVT:
      break;
   default:
      return;
   }

   if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
      rnd = (RoundMode)(rnd & 3);

   cvt->rnd = rnd;
   cvt->setSrc(0, insn->getSrc(0));
   cvt->src(0).mod *= insn->src(0).mod;
   cvt->sType = insn->sType;
}
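// e.g. "cvt s32 (trunc f32 x)" becomes a single "cvt s32 x" with a
// round-to-zero mode, since the integer conversion truncates anyway.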
// Some shaders extract packed bytes out of words and convert them to
// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
// nv50 for word sizes.
//
// CVT(EXTBF(x, byte/word))
// CVT(AND(bytemask, x))
// CVT(AND(bytemask, SHR(x, 8/16/24)))
// CVT(SHR(x, 16/24))
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;
   unsigned width, offset;
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();
      offset = 0;
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;
}
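// e.g. "cvt f32 (and u32 0xff, (shr u32 $r1, 16))" becomes a CVT from u8
// reading byte 2 of $r1 (subOp = offset >> 3 = 2).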

// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits in s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
{
   ImmediateValue imm;
   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
   int s;
   Instruction *add;

   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);

   // look for ADD (TODO: only count references by non-SUCLAMP)
   if (insn->getSrc(0)->refCount() > 1)
      return;
   add = insn->getSrc(0)->getInsn();
   if (!add || add->op != OP_ADD ||
       (add->dType != TYPE_U32 &&
        add->dType != TYPE_S32))
      return;

   // look for immediate
   for (s = 0; s < 2; ++s)
      if (add->src(s).getImmediate(imm))
         break;
   if (s >= 2)
      return;
   s = s ? 0 : 1;
   // determine if immediate fits
   val += imm.reg.data.s32;
   if (val > 31 || val < -32)
      return;
   // determine if other addend fits
   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
      return;

   bld.setPosition(insn, false); // make sure bld is init'ed
   // replace sources
   insn->setSrc(2, bld.mkImm(val));
   insn->setSrc(0, add->getSrc(s));
}
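
// Illustrative sketch (register names invented): with a small immediate
// addend,
//   add u32 $r1 $r2 3
//   suclamp $r0 $r1 $r3 0
// folds into
//   suclamp $r0 $r2 $r3 3
// which is valid because the combined offset (here 3) still fits into the
// signed 6-bit range checked above (-32..31).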

// NEG(AND(SET, 1)) -> SET
void
AlgebraicOpt::handleNEG(Instruction *i) {
   Instruction *src = i->getSrc(0)->getInsn();
   ImmediateValue imm;
   int b;

   if (isFloatType(i->sType) || !src || src->op != OP_AND)
      return;

   if (src->src(0).getImmediate(imm))
      b = 1;
   else if (src->src(1).getImmediate(imm))
      b = 0;
   else
      return;

   if (!imm.isInteger(1))
      return;

   Instruction *set = src->getSrc(b)->getInsn();
   if (set &&
       (set->op == OP_SET || set->op == OP_SET_AND ||
        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
       !isFloatType(set->dType)) {
      i->def(0).replace(set->getDef(0), false);
   }
}

bool
AlgebraicOpt::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      switch (i->op) {
      case OP_ABS:
         handleABS(i);
         break;
      case OP_ADD:
         handleADD(i);
         break;
      case OP_RCP:
         handleRCP(i);
         break;
      case OP_MIN:
      case OP_MAX:
         handleMINMAX(i);
         break;
      case OP_SLCT:
         handleSLCT(i);
         break;
      case OP_AND:
      case OP_OR:
      case OP_XOR:
         handleLOGOP(i);
         break;
      case OP_CVT:
         handleCVT_NEG(i);
         handleCVT_CVT(i);
         if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
            handleCVT_EXTBF(i);
         break;
      case OP_SUCLAMP:
         handleSUCLAMP(i);
         break;
      case OP_NEG:
         handleNEG(i);
         break;
      default:
         break;
      }
   }

   return true;
}

// =============================================================================

// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
class LateAlgebraicOpt : public Pass
{
private:
   virtual bool visit(Instruction *);

   void handleADD(Instruction *);
   bool tryADDToSHLADD(Instruction *);
};

void
LateAlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
      tryADDToSHLADD(add);
}

// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
bool
LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   ImmediateValue imm;
   Instruction *shl;
   Value *src;
   int s;

   if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
       || isFloatType(add->dType))
      return false;

   if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
      s = 0;
   else
   if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
      s = 1;
   else
      return false;

   src = add->getSrc(s);
   shl = src->getUniqueInsn();

   if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
      return false;

   if (!shl->src(1).getImmediate(imm))
      return false;

   add->op = OP_SHLADD;
   add->setSrc(2, add->src(!s));
   // SHL can't have any modifiers, but the ADD source may have had one.
   // Preserve it.
   add->setSrc(0, shl->getSrc(0));
   if (s == 1)
      add->src(0).mod = add->src(1).mod;
   add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
   add->src(1).mod = Modifier(0);

   return true;
}
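
// Illustrative sketch (register names invented): address arithmetic such as
//   shl u32 $r1 $r2 4
//   add u32 $r0 $r1 $r3
// becomes a single fused instruction,
//   shladd u32 $r0 $r2 4 $r3
// provided the shift count is an immediate and the SHL carries no source
// modifiers, as checked above.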

bool
LateAlgebraicOpt::visit(Instruction *i)
{
   switch (i->op) {
   case OP_ADD:
      handleADD(i);
      break;
   default:
      break;
   }

   return true;
}

// =============================================================================

static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
   if (offset != ldst->getSrc(0)->reg.data.offset) {
      if (ldst->getSrc(0)->refCount() > 1)
         ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
      ldst->getSrc(0)->reg.data.offset = offset;
   }
}

// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   class Record
   {
   public:
      Record *next;
      Instruction *insn;
      const Value *rel[2];
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};

MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
{
   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
      loads[i] = NULL;
      stores[i] = NULL;
   }
   prevRecord = NULL;
}

void
MemoryOpt::reset()
{
   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
      Record *it, *next;
      for (it = loads[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      loads[i] = NULL;
      for (it = stores[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      stores[i] = NULL;
   }
}

bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute, indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
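
// Illustrative sketch (offsets and registers invented): two adjacent 32-bit
// loads such as
//   ld u32 $r0 c0[0x10]
//   ld u32 $r1 c0[0x14]
// are merged by combineLd into a single wider access, roughly
//   ld b64 { $r0, $r1 } c0[0x10]
// assuming the target reports 64-bit accesses as supported for that file and
// the combined offset satisfies the alignment checks above.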

bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;
   // for compute, indirect stores are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}

void
MemoryOpt::Record::set(const Instruction *ldst)
{
   const Symbol *mem = ldst->getSrc(0)->asSym();
   fileIndex = mem->reg.fileIndex;
   rel[0] = ldst->getIndirect(0, 0);
   rel[1] = ldst->getIndirect(0, 1);
   offset = mem->reg.data.offset;
   base = mem->getBase();
   size = typeSizeof(ldst->sType);
}

void
MemoryOpt::Record::link(Record **list)
{
   next = *list;
   if (next)
      next->prev = this;
   prev = NULL;
   *list = this;
}

void
MemoryOpt::Record::unlink(Record **list)
{
   if (next)
      next->prev = prev;
   if (prev)
      prev->next = next;
   else
      *list = next;
}

MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
{
   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
      return &loads[insn->src(0).getFile()];
   return &stores[insn->src(0).getFile()];
}

void
MemoryOpt::addRecord(Instruction *i)
{
   Record **list = getList(i);
   Record *it = reinterpret_cast<Record *>(recordPool.allocate());

   it->link(list);
   it->set(i);
   it->insn = i;
   it->locked = false;
}
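
// Illustrative sketch (offsets and registers invented): two adjacent 32-bit
// stores,
//   st u32 l[0x0] $r0
//   st u32 l[0x4] $r1
// collapse via combineSt into
//   st b64 l[0x0] { $r0, $r1 }
// with @st's predicate and indirect-address sources preserved across the
// rewrite by takeExtraSources()/putExtraSources().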

MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      if (it->locked && insn->op != OP_LOAD)
         continue;
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}

bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}

bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}
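
// Illustrative sketch (offsets and registers invented): after
//   st u32 l[0x8] $r2
// a subsequent
//   ld u32 $r5 l[0x8]
// is eliminated by replaceLdFromSt, with every use of $r5 rewritten to read
// $r2 directly, provided offsets, sizes and register files line up exactly
// as checked above.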

bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}

bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   if (this->fileIndex != that.fileIndex)
      return false;

   if (this->rel[0] || that.rel[0])
      return this->base == that.base;
   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}

// Once a load has read from a location, the overlapping stores must not be
// eliminated by, or merged into, later stores to the same location, since
// that would change the value the load observed.
// The stored value can, however, still be used to determine the value
// returned by future loads.
void
MemoryOpt::lockStores(Instruction *const ld)
{
   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
      if (!r->locked && r->overlaps(ld))
         r->locked = true;
}
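
// Illustrative scenario (invented): in
//   st u32 l[0x0] $r0
//   ld u32 $r1 l[0x0]
//   st u32 l[0x0] $r2
// the load locks the first store, so the later store cannot absorb or
// eliminate it; the load itself may still be satisfied directly from $r0.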

// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src(0).getFile();

   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}

bool
MemoryOpt::visit(BasicBlock *bb)
{
   bool ret = runOpt(bb);
   // Run again: a single pass cannot combine four 32-bit ld/st into one
   // 128-bit ld/st on targets where 96-bit memory operations are forbidden.
   if (ret)
      ret = runOpt(bb);
   return ret;
}

bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         if (typeSizeof(ldst->dType) == 4 &&
             ldst->src(1).getFile() == FILE_GPR &&
             ldst->getSrc(1)->getInsn() &&
             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
            delete_Instruction(prog, ldst);
            continue;
         }
         isLoad = false;
      } else {
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;
      if (ldst->perPatch) // TODO: create separate per-patch lists
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[], look for a previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}
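
// Illustrative scenario (invented): in a sequence like
//   ld u32 $r0 g[$r4]
//   bar
//   ld u32 $r1 g[$r4]
// the barrier purges all load/store records, so runOpt will not replace the
// second load with the first; another thread may have written g[$r4] in
// between.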

// =============================================================================

// Turn control flow into predicated instructions (after register allocation !).
// TODO:
// Could move this to before register allocation on NVC0 and also handle nested
// constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   bool tryPredicateConditional(BasicBlock *);
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   void tryPropagateBranch(BasicBlock *);
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);

   uint8_t gpr_unit;
};

bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
         // in register "units", which can vary between targets.
         if (file == FILE_GPR) {
            Value *v = insn->getSrc(s);
            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
            int units = bytes >> gpr_unit;
            if (units > prog->maxGPR)
               file = FILE_IMMEDIATE;
         }
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}

void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}

void
FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
{
   for (Instruction *i = bb->getEntry(); i; i = i->next) {
      if (i->isNop())
         continue;
      assert(!i->getPredicate());
      i->setPredicate(cc, pred);
   }
   removeFlow(bb->getExit());
}
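
// Illustrative sketch (invented): predicateInstructions(bb, $p0, cc) turns a
// conditional body such as
//   add f32 $r0 $r1 $r2
//   mov $r3 $r0
// into
//   @p0 add f32 $r0 $r1 $r2
//   @p0 mov $r3 $r0
// and drops the block's terminating flow instruction; tryPredicateConditional
// below applies this to both sides of a small if/else, one side with the
// inverted condition code.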

bool
FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
{
   if (insn->isPseudo())
      return true;
   // TODO: calls where we don't know which registers are modified

   if (!prog->getTarget()->mayPredicate(insn, pred))
      return false;
   for (int d = 0; insn->defExists(d); ++d)
      if (insn->getDef(d)->equals(pred))
         return false;
   return true;
}

// If we jump to BRA/RET/EXIT, replace the jump with it.
// NOTE: We do not update the CFG anymore here !
//
// TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
//  BB:0
//   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
//  BB1:
//   bra BB:3
//  BB2:
//   ...
//  BB3:
//   ...
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      // be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      // add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}

bool
FlatteningPass::visit(Function *fn)
{
   gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);

   return true;
}

bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   if (prog->getTarget()->hasJoin) {
      Instruction *insn = bb->getExit();
      if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
         insn = insn->prev;
         if (insn && !insn->getPredicate() &&
             !insn->asFlow() &&
             insn->op != OP_DISCARD &&
             insn->op != OP_TEXBAR &&
             !isTextureOp(insn->op) && // probably just nve4
             !isSurfaceOp(insn->op) && // not confirmed
             insn->op != OP_LINTERP && // probably just nve4
             insn->op != OP_PINTERP && // probably just nve4
             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
             !insn->isNop()) {
            insn->join = 1;
            bb->remove(bb->getExit());
            return true;
         }
      }
   }

   tryPropagateBranch(bb);

   return true;
}

bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}
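
// Illustrative sketch (invented): if a block consists solely of "exit", a
// conditional jump to it, e.g.
//   @p0 bra BB:4        (BB:4 contains only: exit)
// is rewritten by tryPropagateBranch into
//   @p0 exit
// and BB:4's own terminator is removed once no other edge leads to it.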

// =============================================================================

// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
// TODO:
// Does NVC0+ have other situations where this pass makes sense?
class NV50PostRaConstantFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

static bool
post_ra_dead(Instruction *i)
{
   for (int d = 0; i->defExists(d); ++d)
      if (i->getDef(d)->refCount())
         return false;
   return true;
}

bool
NV50PostRaConstantFolding::visit(BasicBlock *bb)
{
   Value *vtmp;
   Instruction *def;

   for (Instruction *i = bb->getFirst(); i; i = i->next) {
      switch (i->op) {
      case OP_MAD:
         if (i->def(0).getFile() != FILE_GPR ||
             i->src(0).getFile() != FILE_GPR ||
             i->src(1).getFile() != FILE_GPR ||
             i->src(2).getFile() != FILE_GPR ||
             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
            break;

         if (i->getDef(0)->reg.data.id >= 64 ||
             i->getSrc(0)->reg.data.id >= 64)
            break;

         if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
            break;

         if (i->getPredicate())
            break;

         def = i->getSrc(1)->getInsn();
         if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
            def = def->getSrc(0)->getInsn();
         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
            vtmp = i->getSrc(1);
            if (isFloatType(i->sType)) {
               i->setSrc(1, def->getSrc(0));
            } else {
               ImmediateValue val;
               bool ret = def->src(0).getImmediate(val);
               assert(ret);
               if (i->getSrc(1)->reg.data.id & 1)
                  val.reg.data.u32 >>= 16;
               val.reg.data.u32 &= 0xffff;
               i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
            }

            /* There's no post-RA dead code elimination, so do it here.
             * XXX: if we add more code-removing post-RA passes, we might
             *      want to create a post-RA dead-code elim pass */
            if (post_ra_dead(vtmp->getInsn())) {
               Value *src = vtmp->getInsn()->getSrc(0);
               // Careful -- splits will have already been removed from the
               // functions. Don't double-delete.
               if (vtmp->getInsn()->bb)
                  delete_Instruction(prog, vtmp->getInsn());
               if (src->getInsn() && post_ra_dead(src->getInsn()))
                  delete_Instruction(prog, src->getInsn());
            }

            break;
         }
         break;
      default:
         break;
      }
   }

   return true;
}
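
// Illustrative sketch (register numbers invented): after RA, with
//   mov $r1 0x3f800000
//   mad f32 $r0 $r2 $r1 $r0
// the pass inlines the immediate, giving roughly
//   mad f32 $r0 $r2 1.0 $r0
// which is encodable on nv50 only because SDST and SSRC2 already share a
// register, as required by the checks above; the now-dead MOV is removed.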

// =============================================================================

// Common subexpression elimination. Simple O(n^2) implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   inline bool tryReplace(Instruction **, Instruction *);

   DLList ops[OP_LAST + 1];
};

class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      return false;
   } else {
      if (this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache ||
       this->mask != that->mask)
      return false;

   return true;
}

bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      case FILE_SHADER_OUTPUT:
         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
      default:
         return false;
      }
   }

   return true;
}

// pull common expressions through from the different incoming blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      if (ik->defCount(0xff) > 1)
         continue; // too painful to check if we can really push this forward
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}
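
// Illustrative sketch (invented): when every predecessor computes the same
// thing,
//   BB:1: ld u32 $r0 c0[0x0]      BB:2: ld u32 $r1 c0[0x0]
//   BB:3: phi $r2 $r0 $r1
// GlobalCSE::visit moves one copy of the load into BB:3, rewires it to define
// $r2, and deletes the phi; isResultEqual() is what proves both sources would
// produce the same value, and the remaining copy dies as dead code.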

bool
LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
{
   Instruction *old = *ptr;

   // TODO: maybe relax this later (causes trouble with OP_UNION)
   if (i->isPredicated())
      return false;

   if (!old->isResultEqual(i))
      return false;

   for (int d = 0; old->defExists(d); ++d)
      old->def(d).replace(i->getDef(d), false);
   delete_Instruction(prog, old);
   *ptr = NULL;
   return true;
}

bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getFirst(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}
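
// Illustrative sketch (invented): within one basic block,
//   mul f32 $r0 $r1 $r2
//   mul f32 $r3 $r1 $r2
// the second multiply is deleted by tryReplace() and all uses of $r3 are
// rewritten to $r0. Candidates are found either through the use list of the
// least-referenced source or, for sourceless ops, via the per-opcode lists.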

// =============================================================================

// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount;
};

bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            i->setDef(0, NULL);
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}

// Each load can go into up to 4 destinations, any of which might potentially
// be dead (i.e. a hole). These can always be split into 2 loads, independent
// of where the holes are. We find the first contiguous region, put it into
// the first load, and then put the second contiguous region into the second
// load. There can be at most 2 contiguous regions.
//
// Note that there are some restrictions, for example it's not possible to do
// a 64-bit load that's not 64-bit aligned, so such a load has to be split
// up. Also hardware doesn't support 96-bit loads, so those also have to be
// split into a 64-bit and a 32-bit load.
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff;

   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;

   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Scale back the size of the first load until it can be loaded. This
   // typically happens for TYPE_B96 loads.
   while (n1 &&
          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
                                                typeOfSize(size1))) {
      size1 -= def1[--n1]->reg.size;
      d--;
   }

   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else if (!n2) {
         addr2 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Make sure that we've processed all the values
   for (; ld1->defExists(d); ++d)
      assert(!(mask & (1 << d)));

   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}
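
// Illustrative sketch (invented): for a 4-component load whose third result
// is unused,
//   ld b128 { $r0, $r1, dead, $r3 } c0[0x0]
// checkSplitLoad emits
//   ld b64 { $r0, $r1 } c0[0x0]
//   ld u32 $r3 c0[0xc]
// i.e. one load per contiguous live region, subject to the alignment and
// access-size limits handled above.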

// =============================================================================

#define RUN_PASS(l, n, f)                    \
   if (level >= (l)) {                       \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
         INFO("PEEPHOLE: %s\n", #n);         \
      n pass;                                \
      if (!pass.f(this))                     \
         return false;                       \
   }

bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(2, LateAlgebraicOpt, run);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}

bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   if (getTarget()->getChipset() < 0xc0)
      RUN_PASS(2, NV50PostRaConstantFolding, run);

   return true;
}

} // namespace nv50_ir