/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "nv50_ir.h"
#include "nv50_ir_target.h"
#include "nv50_ir_build_util.h"

extern "C" {
#include "util/u_math.h"
}

namespace nv50_ir {

bool
Instruction::isNop() const
{
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}

bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT ||
       op == OP_WRSV)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
}

// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
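// A minimal illustration (hypothetical SSA snippet):
//   mov %r2, %r1
//   add %r3, %r2, %r0   ->   add %r3, %r1, %r0
// Once %r2 has no users left, the MOV itself is removed as dead code.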
bool
CopyPropagation::visit(BasicBlock *bb)
{
   Instruction *mov, *si, *next;

   for (mov = bb->getEntry(); mov; mov = next) {
      next = mov->next;
      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
         continue;
      if (mov->getPredicate())
         continue;
      if (mov->def(0).getFile() != mov->src(0).getFile())
         continue;
      si = mov->getSrc(0)->getInsn();
      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
         // propagate
         mov->def(0).replace(mov->getSrc(0), false);
         delete_Instruction(prog, mov);
      }
   }
   return true;
}

// =============================================================================

class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmd32Load(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};

bool
LoadPropagation::isCSpaceLoad(Instruction *ld)
{
   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
}

bool
LoadPropagation::isImmd32Load(Instruction *ld)
{
   if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
      return false;
   return ld->src(0).getFile() == FILE_IMMEDIATE;
}

bool
LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
{
   return ld &&
      (ld->op == OP_VFETCH ||
       (ld->op == OP_LOAD &&
        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
}

void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   if (!prog->getTarget()->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT)
         return;
   if (insn->src(1).getFile() != FILE_GPR)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   if (isCSpaceLoad(i0)) {
      if (!isCSpaceLoad(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isImmd32Load(i0)) {
      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   if (insn->op == OP_SET)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
}

bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}

// =============================================================================

// Evaluate constant expressions.
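// For example (hypothetical IR), an operation whose sources both fold to
// immediates is computed at compile time:
//   mul u32 %r0, 6, 7       ->   mov u32 %r0, 42
//   add f32 %r1, 1.0, 0.5   ->   mov f32 %r1, 1.5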
class ConstantFolding : public Pass
{
public:
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void opnd(Instruction *, ImmediateValue&, int s);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
   CmpInstruction *findOriginForTestWithZero(Value *);

   unsigned int foldCount;

   BuildUtil bld;
};

// TODO: remember generated immediates and only revisit these
bool
ConstantFolding::foldAll(Program *prog)
{
   unsigned int iterCount = 0;
   do {
      foldCount = 0;
      if (!run(prog))
         return false;
   } while (foldCount && ++iterCount < 2);
   return true;
}

bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1;

      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
   }
   return true;
}

CmpInstruction *
ConstantFolding::findOriginForTestWithZero(Value *value)
{
   if (!value)
      return NULL;
   Instruction *insn = value->getInsn();

   while (insn && insn->op != OP_SET) {
      Instruction *next = NULL;
      switch (insn->op) {
      case OP_NEG:
      case OP_ABS:
      case OP_CVT:
         next = insn->getSrc(0)->getInsn();
         if (!next || insn->sType != next->dType)
            return NULL;
         break;
      case OP_MOV:
         next = insn->getSrc(0)->getInsn();
         break;
      default:
         return NULL;
      }
      insn = next;
   }
   return insn ? insn->asCmp() : NULL;
}

void
Modifier::applyTo(ImmediateValue& imm) const
{
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}

operation
Modifier::getOp() const
{
   switch (bits) {
   case NV50_IR_MOD_ABS: return OP_ABS;
   case NV50_IR_MOD_NEG: return OP_NEG;
   case NV50_IR_MOD_SAT: return OP_SAT;
   case NV50_IR_MOD_NOT: return OP_NOT;
   case 0:
      return OP_MOV;
   default:
      return OP_CVT;
   }
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   default:
      return;
   }
   ++foldCount;

   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;

   if (i->op == OP_MAD || i->op == OP_FMA) {
      i->op = OP_ADD;

      i->setSrc(1, i->getSrc(0));
      i->src(1).mod = i->src(2).mod;
      i->setSrc(0, i->getSrc(2));
      i->setSrc(2, NULL);

      ImmediateValue src0;
      if (i->src(0).getImmediate(src0))
         expr(i, src0, *i->getSrc(1)->asImm());
   } else {
      i->op = OP_MOV;
   }
}

void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}

void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32;
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (insn && !mul2->src(t).mod &&
          insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
         }
         mul1->saturate = mul2->saturate;
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = mul2->getDef(0)->uses.front()->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}

void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const int t = !s;
   const operation op = i->op;

   switch (i->op) {
   case OP_MUL:
      if (i->dType == TYPE_F32)
         tryCollapseChainedMULs(i, s, imm0);

      if (imm0.isInteger(0)) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else
      if (imm0.isInteger(1) || imm0.isInteger(-1)) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      if (imm0.isInteger(2) || imm0.isInteger(-2)) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
         i->op = OP_SHL;
         imm0.applyLog2();
         i->setSrc(0, i->getSrc(t));
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
      }
      break;
   case OP_ADD:
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
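      // Trivial divisors are handled directly below; the general case rewrites
      // the division as a multiply-high with a precomputed magic number
      // (cf. Granlund & Montgomery, "Division by Invariant Integers using
      // Multiplication"). For u32, with l = ceil(log2(d)), this computes
      //   m = (2^32 * (2^l - d)) / d + 1
      //   q = (mulhi(a, m) + ((a - mulhi(a, m)) >> 1)) >> (l - 1)
      // the signed path uses the analogous signed magic number plus a sign
      // correction term.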
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
      if (i->dType == TYPE_U32) {
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s;
         uint32_t l = util_logbase2(d);
         if (((uint32_t)1 << l) < d)
            ++l;
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
      } else
      if (imm0.reg.data.s32 == -1) {
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);

         delete_Instruction(prog, i);
      }
      break;

   case OP_MOD:
      if (i->sType == TYPE_U32 && imm0.isPow2()) {
         bld.setPosition(i, false);
         i->op = OP_AND;
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (i->src(t).mod != Modifier(0))
         return;
      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
         return;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break;
      case CC_GE: cc = CC_TR; break;
      case CC_EQ: cc = inverseCondCode(cc); break;
      case CC_LE: cc = inverseCondCode(cc); break;
      case CC_GT: break;
      case CC_NE: break;
      default:
         return;
      }
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      i->sType = si->sType;
   }
      break;

   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si || si->op != OP_SHL)
         break;
      ImmediateValue imm1;
      if (si->src(1).getImmediate(imm1)) {
         bld.setPosition(i, false);
         i->setSrc(0, si->getSrc(0));
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
      }
   }
      break;

   case OP_ABS:
   case OP_NEG:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   default:
      return;
   }
   if (i->op != op)
      foldCount++;
}

// =============================================================================

// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
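         // e.g. (hypothetical): sub %r0, %r1, %r2 -> add %r0, %r1, neg %r2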
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}

// =============================================================================

// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT(Instruction *);

   BuildUtil bld;
};

void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // expect not to have mods yet, if we do, bail
   if (sub->src(0).mod || sub->src(1).mod)
      return;
   // hidden conversion ?
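   // (i.e. ABS(SUB) is only turned into SAD below if the ABS does not also
   // perform a type conversion, since SAD could not preserve it)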
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
      return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB)
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}

bool
AlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return false;

   bool changed = false;
   if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_MAD);
   if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_SAD);
   return changed;
}

// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
// ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
       (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
      return false;

   src = add->getSrc(s);

   if (src->getInsn()->postFactor)
      return false;
   if (toOp == OP_SAD) {
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high

   add->setSrc(2, add->src(s ? 0 : 1));
   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}

void
AlgebraicOpt::handleMINMAX(Instruction *minmax)
{
   Value *src0 = minmax->getSrc(0);
   Value *src1 = minmax->getSrc(1);

   if (src0 != src1 || src0->reg.file != FILE_GPR)
      return;
   if (minmax->src(0).mod == minmax->src(1).mod) {
      if (minmax->def(0).mayReplace(minmax->src(0))) {
         minmax->def(0).replace(minmax->src(0), false);
         minmax->bb->remove(minmax);
      } else {
         minmax->op = OP_CVT;
         minmax->setSrc(1, NULL);
      }
   } else {
      // TODO:
      // min(x, -x) = -abs(x)
      // min(x, -abs(x)) = -abs(x)
      // min(x, abs(x)) = x
      // max(x, -abs(x)) = x
      // max(x, abs(x)) = abs(x)
      // max(x, -x) = abs(x)
   }
}

void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
   Instruction *si = rcp->getSrc(0)->getUniqueInsn();

   if (si && si->op == OP_RCP) {
      Modifier mod = rcp->src(0).mod * si->src(0).mod;
      rcp->op = mod.getOp();
      rcp->setSrc(0, si->getSrc(0));
   }
}

void
AlgebraicOpt::handleSLCT(Instruction *slct)
{
   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
         slct->setSrc(0, slct->getSrc(1));
   } else
   if (slct->getSrc(0) != slct->getSrc(1)) {
      return;
   }
   slct->op = OP_MOV;
   slct->setSrc(1, NULL);
   slct->setSrc(2, NULL);
}

void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}

// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
// F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT(Instruction *cvt)
{
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}

bool
AlgebraicOpt::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      switch (i->op) {
      case OP_ABS:
         handleABS(i);
         break;
      case OP_ADD:
         handleADD(i);
         break;
      case OP_RCP:
         handleRCP(i);
         break;
      case OP_MIN:
      case OP_MAX:
         handleMINMAX(i);
         break;
      case OP_SLCT:
         handleSLCT(i);
         break;
      case OP_AND:
      case OP_OR:
      case OP_XOR:
         handleLOGOP(i);
         break;
      case OP_CVT:
         handleCVT(i);
         break;
      default:
         break;
      }
   }

   return true;
}

// =============================================================================

static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
   if (offset != ldst->getSrc(0)->reg.data.offset) {
      if (ldst->getSrc(0)->refCount() > 1)
         ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
      ldst->getSrc(0)->reg.data.offset = offset;
   }
}

// Combine loads and stores, forward stores to loads where possible.
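// For example (hypothetical offsets), two adjacent 32-bit loads can be merged:
//   ld u32 %r0, c0[0x0]
//   ld u32 %r1, c0[0x4]   ->   ld u64 { %r0 %r1 }, c0[0x0]
// and a load from a just-stored location can reuse the stored value:
//   st l[0x10], %r2
//   ld u32 %r3, l[0x10]   ->   uses of %r3 replaced by %r2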
class MemoryOpt : public Pass
{
private:
   class Record
   {
   public:
      Record *next;
      Instruction *insn;
      const Value *rel[2];
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};

MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
{
   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
      loads[i] = NULL;
      stores[i] = NULL;
   }
   prevRecord = NULL;
}

void
MemoryOpt::reset()
{
   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
      Record *it, *next;
      for (it = loads[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      loads[i] = NULL;
      for (it = stores[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      stores[i] = NULL;
   }
}

bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

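   // @ld's values are now produced by the widened recorded load, so the
   // original instruction is redundant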
   delete_Instruction(prog, ld);

   return true;
}

bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}

void
MemoryOpt::Record::set(const Instruction *ldst)
{
   const Symbol *mem = ldst->getSrc(0)->asSym();
   fileIndex = mem->reg.fileIndex;
   rel[0] = ldst->getIndirect(0, 0);
   rel[1] = ldst->getIndirect(0, 1);
   offset = mem->reg.data.offset;
   base = mem->getBase();
   size = typeSizeof(ldst->sType);
}

void
MemoryOpt::Record::link(Record **list)
{
   next = *list;
   if (next)
      next->prev = this;
   prev = NULL;
   *list = this;
}

void
MemoryOpt::Record::unlink(Record **list)
{
   if (next)
      next->prev = prev;
   if (prev)
      prev->next = next;
   else
      *list = next;
}

MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
{
   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
      return &loads[insn->src(0).getFile()];
   return &stores[insn->src(0).getFile()];
}

void
MemoryOpt::addRecord(Instruction *i)
{
   Record **list = getList(i);
   Record *it = reinterpret_cast<Record *>(recordPool.allocate());

   it->link(list);
   it->set(i);
   it->insn = i;
   it->locked = false;
}

MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
   for (; it; it = it->next) {
      if (it->locked && insn->op != OP_LOAD)
         continue;
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}

bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}

bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}

bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= Elements(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}

bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   if (this->fileIndex != that.fileIndex)
      return false;

   if (this->rel[0] || that.rel[0])
      return this->base == that.base;
   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}

// We must not eliminate stores that affect the result of @ld if
// we find later stores to the same location, and we may no longer
// merge them with later stores.
// The stored value can, however, still be used to determine the value
// returned by future loads.
void
MemoryOpt::lockStores(Instruction *const ld)
{
   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
      if (!r->locked && r->overlaps(ld))
         r->locked = true;
}

// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src(0).getFile();

   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}

bool
MemoryOpt::visit(BasicBlock *bb)
{
   bool ret = runOpt(bb);
   // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
   // where 96 bit memory operations are forbidden.
   if (ret)
      ret = runOpt(bb);
   return ret;
}

bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         isLoad = false;
      } else {
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
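            // a matching record that is not merely adjacent covers this
            // load's range, so the store's sources can be forwarded in
            // place of the reload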
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}

// =============================================================================

// Turn control flow into predicated instructions (after register allocation !).
// TODO:
// Could move this to before register allocation on NVC0 and also handle nested
// constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   bool tryPredicateConditional(BasicBlock *);
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   void tryPropagateBranch(BasicBlock *);
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);
};

bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0
         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
            file = FILE_IMMEDIATE;
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}

void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}

void
FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
{
   for (Instruction *i = bb->getEntry(); i; i = i->next) {
      if (i->isNop())
         continue;
      assert(!i->getPredicate());
      i->setPredicate(cc, pred);
   }
   removeFlow(bb->getExit());
}

bool
FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
{
   if (insn->isPseudo())
      return true;
   // TODO: calls where we don't know which registers are modified

   if (!prog->getTarget()->mayPredicate(insn, pred))
      return false;
   for (int d = 0; insn->defExists(d); ++d)
      if (insn->getDef(d)->equals(pred))
         return false;
   return true;
}

// If we conditionally skip over or to a branch instruction, replace it.
// NOTE: We do not update the CFG anymore here !
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   BasicBlock *bf = NULL;
   unsigned int i;

   if (bb->cfg.outgoingCount() != 2)
      return;
   if (!bb->getExit() || bb->getExit()->op != OP_BRA)
      return;
   Graph::EdgeIterator ei = bb->cfg.outgoing();

   for (i = 0; !ei.end(); ++i, ei.next()) {
      bf = BasicBlock::get(ei.getNode());
      if (bf->getInsnCount() == 1)
         break;
   }
   if (ei.end() || !bf->getExit())
      return;
   FlowInstruction *bra = bb->getExit()->asFlow();
   FlowInstruction *rep = bf->getExit()->asFlow();

   if (rep->getPredicate())
      return;
   if (rep->op != OP_BRA &&
       rep->op != OP_JOIN &&
       rep->op != OP_EXIT)
      return;

   bra->op = rep->op;
   bra->target.bb = rep->target.bb;
   if (i) // 2nd out block means branch not taken
      bra->cc = inverseCondCode(bra->cc);
   bf->remove(rep);
}

bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   Instruction *insn = bb->getExit();
   if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
      insn = insn->prev;
      if (insn && !insn->getPredicate() &&
          !insn->asFlow() &&
          insn->op != OP_TEXBAR &&
          !isTextureOp(insn->op) && // probably just nve4
          insn->op != OP_LINTERP && // probably just nve4
          insn->op != OP_PINTERP && // probably just nve4
          ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
           typeSizeof(insn->dType) <= 4) &&
          !insn->isNop()) {
         insn->join = 1;
         bb->remove(bb->getExit());
         return true;
      }
   }

   tryPropagateBranch(bb);

   return true;
}

bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}

// =============================================================================

// Common subexpression elimination. Stupid O(n^2) implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   inline bool tryReplace(Instruction **, Instruction *);

   DLList ops[OP_LAST + 1];
};

class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      return false;
   } else {
      if (this->atomic != that->atomic ||
          this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache)
      return false;

   return true;
}

bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   if (op == OP_LOAD || op == OP_VFETCH) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      default:
         return false;
      }
   }

   return true;
}

// Pull expressions that are common to all in-blocks through the phi into
// this block.
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}

bool
LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
{
   Instruction *old = *ptr;

   // TODO: maybe relax this later (causes trouble with OP_UNION)
   if (i->isPredicated())
      return false;

   if (!old->isResultEqual(i))
      return false;

   for (int d = 0; old->defExists(d); ++d)
      old->def(d).replace(i->getDef(d), false);
   delete_Instruction(prog, old);
   *ptr = NULL;
   return true;
}

bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getEntry(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}

// =============================================================================
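// A worked example of the split performed by DeadCodeElim::checkSplitLoad()
// below (schematic syntax; assumes 4-byte components): if only the 1st and
// 4th components of a vec4 constant-buffer load are still live,
//
//    ld v4 { $r0, $r1, $r2, $r3 } c0[0x10]   // $r1, $r2 are dead
//
// the load is shrunk to its first run of live components, and a second load
// is created for the remainder, with the offset advanced past the dead parts:
//
//    ld u32 $r0 c0[0x10]
//    ld u32 $r3 c0[0x1c]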
// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount;
};

bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *next;

   for (Instruction *i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         checkSplitLoad(i);
      }
   }
   return true;
}

void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff;

   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
      }
   }

   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}

// =============================================================================

#define RUN_PASS(l, n, f)                    \
   if (level >= (l)) {                       \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
         INFO("PEEPHOLE: %s\n", #n);         \
      n pass;                                \
      if (!pass.f(this))                     \
         return false;                       \
   }

bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> fewer checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}

bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   return true;
}

} // namespace nv50_ir
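// For reference, the first RUN_PASS invocation in Program::optimizeSSA()
// above expands (modulo whitespace) to:
//
//    if (level >= (1)) {
//       if (dbgFlags & NV50_IR_DEBUG_VERBOSE)
//          INFO("PEEPHOLE: %s\n", "DeadCodeElim");
//       DeadCodeElim pass;
//       if (!pass.buryAll(this))
//          return false;
//    }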