1 /* 2 * Copyright 2011 Christoph Bumiller 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "codegen/nv50_ir.h" 24 #include "codegen/nv50_ir_build_util.h" 25 26 #include "codegen/nv50_ir_target_nvc0.h" 27 #include "codegen/nv50_ir_lowering_nvc0.h" 28 29 #include <limits> 30 31 namespace nv50_ir { 32 33 #define QOP_ADD 0 34 #define QOP_SUBR 1 35 #define QOP_SUB 2 36 #define QOP_MOV2 3 37 38 // UL UR LL LR 39 #define QUADOP(q, r, s, t) \ 40 ((QOP_##q << 6) | (QOP_##r << 4) | \ 41 (QOP_##s << 2) | (QOP_##t << 0)) 42 43 void 44 NVC0LegalizeSSA::handleDIV(Instruction *i) 45 { 46 FlowInstruction *call; 47 int builtin; 48 Value *def[2]; 49 50 bld.setPosition(i, false); 51 def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0); 52 def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0); 53 switch (i->dType) { 54 case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break; 55 case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break; 56 default: 57 return; 58 } 59 call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); 60 bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]); 61 bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2); 62 bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0); 63 64 call->fixed = 1; 65 call->absolute = call->builtin = 1; 66 call->target.builtin = builtin; 67 delete_Instruction(prog, i); 68 } 69 70 void 71 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) 72 { 73 assert(i->dType == TYPE_F64); 74 // There are instructions that will compute the high 32 bits of the 64-bit 75 // float. We will just stick 0 in the bottom 32 bits. 76 77 bld.setPosition(i, false); 78 79 // 1. Take the source and it up. 80 Value *src[2], *dst[2], *def = i->getDef(0); 81 bld.mkSplit(src, 4, i->getSrc(0)); 82 83 // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. 84 dst[0] = bld.loadImm(NULL, 0); 85 dst[1] = bld.getSSA(); 86 87 // 3. The new version of the instruction takes the high 32 bits of the 88 // source and outputs the high 32 bits of the destination. 89 i->setSrc(0, src[1]); 90 i->setDef(0, dst[1]); 91 i->setType(TYPE_F32); 92 i->subOp = NV50_IR_SUBOP_RCPRSQ_64H; 93 94 // 4. Recombine the two dst pieces back into the original destination. 
95 bld.setPosition(i, true); 96 bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]); 97 } 98 99 void 100 NVC0LegalizeSSA::handleFTZ(Instruction *i) 101 { 102 // Only want to flush float inputs 103 assert(i->sType == TYPE_F32); 104 105 // If we're already flushing denorms (and NaN's) to zero, no need for this. 106 if (i->dnz) 107 return; 108 109 // Only certain classes of operations can flush 110 OpClass cls = prog->getTarget()->getOpClass(i->op); 111 if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE && 112 cls != OPCLASS_CONVERT) 113 return; 114 115 i->ftz = true; 116 } 117 118 void 119 NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i) 120 { 121 if (i->tex.levelZero) 122 return; 123 124 ImmediateValue lod; 125 126 // The LOD argument comes right after the coordinates (before depth bias, 127 // offsets, etc). 128 int arg = i->tex.target.getArgCount(); 129 130 // SM30+ stores the indirect handle as a separate arg, which comes before 131 // the LOD. 132 if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET && 133 i->tex.rIndirectSrc >= 0) 134 arg++; 135 // SM20 stores indirect handle combined with array coordinate 136 if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET && 137 !i->tex.target.isArray() && 138 i->tex.rIndirectSrc >= 0) 139 arg++; 140 141 if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0)) 142 return; 143 144 if (i->op == OP_TXL) 145 i->op = OP_TEX; 146 i->tex.levelZero = true; 147 i->moveSources(arg + 1, -1); 148 } 149 150 bool 151 NVC0LegalizeSSA::visit(Function *fn) 152 { 153 bld.setProgram(fn->getProgram()); 154 return true; 155 } 156 157 bool 158 NVC0LegalizeSSA::visit(BasicBlock *bb) 159 { 160 Instruction *next; 161 for (Instruction *i = bb->getEntry(); i; i = next) { 162 next = i->next; 163 164 if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE) 165 handleFTZ(i); 166 167 switch (i->op) { 168 case OP_DIV: 169 case OP_MOD: 170 if (i->sType != TYPE_F32) 171 handleDIV(i); 172 break; 173 case OP_RCP: 174 case OP_RSQ: 175 if (i->dType == TYPE_F64) 176 handleRCPRSQ(i); 177 break; 178 case OP_TXL: 179 case OP_TXF: 180 handleTEXLOD(i->asTex()); 181 break; 182 default: 183 break; 184 } 185 } 186 return true; 187 } 188 189 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog) 190 : rZero(NULL), 191 carry(NULL), 192 pOne(NULL), 193 needTexBar(prog->getTarget()->getChipset() >= 0xe0 && 194 prog->getTarget()->getChipset() < 0x110) 195 { 196 } 197 198 bool 199 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later, 200 const Instruction *early) const 201 { 202 if (early->bb == later->bb) 203 return early->serial < later->serial; 204 return later->bb->dominatedBy(early->bb); 205 } 206 207 void 208 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses, 209 Instruction *usei, const Instruction *texi) 210 { 211 bool add = true; 212 bool dominated = insnDominatedBy(usei, texi); 213 // Uses before the tex have to all be included. Just because an earlier 214 // instruction dominates another instruction doesn't mean that there's no 215 // way to get from the tex to the later instruction. For example you could 216 // have nested loops, with the tex in the inner loop, and uses before it in 217 // both loops - even though the outer loop's instruction would dominate the 218 // inner's, we still want a texbar before the inner loop's instruction. 219 // 220 // However we can still use the eliding logic between uses dominated by the 221 // tex instruction, as that is unambiguously correct. 
222 if (dominated) { 223 for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) { 224 if (it->after) { 225 if (insnDominatedBy(usei, it->insn)) { 226 add = false; 227 break; 228 } 229 if (insnDominatedBy(it->insn, usei)) { 230 it = uses.erase(it); 231 continue; 232 } 233 } 234 ++it; 235 } 236 } 237 if (add) 238 uses.push_back(TexUse(usei, texi, dominated)); 239 } 240 241 // While it might be tempting to use the an algorithm that just looks at tex 242 // uses, not all texture results are guaranteed to be used on all paths. In 243 // the case where along some control flow path a texture result is never used, 244 // we might reuse that register for something else, creating a 245 // write-after-write hazard. So we have to manually look through all 246 // instructions looking for ones that reference the registers in question. 247 void 248 NVC0LegalizePostRA::findFirstUses( 249 Instruction *texi, std::list<TexUse> &uses) 250 { 251 int minGPR = texi->def(0).rep()->reg.data.id; 252 int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1; 253 254 unordered_set<const BasicBlock *> visited; 255 findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited); 256 } 257 258 void 259 NVC0LegalizePostRA::findFirstUsesBB( 260 int minGPR, int maxGPR, Instruction *start, 261 const Instruction *texi, std::list<TexUse> &uses, 262 unordered_set<const BasicBlock *> &visited) 263 { 264 const BasicBlock *bb = start->bb; 265 266 // We don't process the whole bb the first time around. This is correct, 267 // however we might be in a loop and hit this BB again, and need to process 268 // the full thing. So only mark a bb as visited if we processed it from the 269 // beginning. 270 if (start == bb->getEntry()) { 271 if (visited.find(bb) != visited.end()) 272 return; 273 visited.insert(bb); 274 } 275 276 for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) { 277 if (insn->isNop()) 278 continue; 279 280 for (int d = 0; insn->defExists(d); ++d) { 281 const Value *def = insn->def(d).rep(); 282 if (insn->def(d).getFile() != FILE_GPR || 283 def->reg.data.id + def->reg.size / 4 - 1 < minGPR || 284 def->reg.data.id > maxGPR) 285 continue; 286 addTexUse(uses, insn, texi); 287 return; 288 } 289 290 for (int s = 0; insn->srcExists(s); ++s) { 291 const Value *src = insn->src(s).rep(); 292 if (insn->src(s).getFile() != FILE_GPR || 293 src->reg.data.id + src->reg.size / 4 - 1 < minGPR || 294 src->reg.data.id > maxGPR) 295 continue; 296 addTexUse(uses, insn, texi); 297 return; 298 } 299 } 300 301 for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { 302 findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(), 303 texi, uses, visited); 304 } 305 } 306 307 // Texture barriers: 308 // This pass is a bit long and ugly and can probably be optimized. 309 // 310 // 1. obtain a list of TEXes and their outputs' first use(s) 311 // 2. calculate the barrier level of each first use (minimal number of TEXes, 312 // over all paths, between the TEX and the use in question) 313 // 3. 
for each barrier, if all paths from the source TEX to that barrier 314 // contain a barrier of lesser level, it can be culled 315 bool 316 NVC0LegalizePostRA::insertTextureBarriers(Function *fn) 317 { 318 std::list<TexUse> *uses; 319 std::vector<Instruction *> texes; 320 std::vector<int> bbFirstTex; 321 std::vector<int> bbFirstUse; 322 std::vector<int> texCounts; 323 std::vector<TexUse> useVec; 324 ArrayList insns; 325 326 fn->orderInstructions(insns); 327 328 texCounts.resize(fn->allBBlocks.getSize(), 0); 329 bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize()); 330 bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize()); 331 332 // tag BB CFG nodes by their id for later 333 for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) { 334 BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get()); 335 if (bb) 336 bb->cfg.tag = bb->getId(); 337 } 338 339 // gather the first uses for each TEX 340 for (int i = 0; i < insns.getSize(); ++i) { 341 Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i)); 342 if (isTextureOp(tex->op)) { 343 texes.push_back(tex); 344 if (!texCounts.at(tex->bb->getId())) 345 bbFirstTex[tex->bb->getId()] = texes.size() - 1; 346 texCounts[tex->bb->getId()]++; 347 } 348 } 349 insns.clear(); 350 if (texes.empty()) 351 return false; 352 uses = new std::list<TexUse>[texes.size()]; 353 if (!uses) 354 return false; 355 for (size_t i = 0; i < texes.size(); ++i) { 356 findFirstUses(texes[i], uses[i]); 357 } 358 359 // determine the barrier level at each use 360 for (size_t i = 0; i < texes.size(); ++i) { 361 for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end(); 362 ++u) { 363 BasicBlock *tb = texes[i]->bb; 364 BasicBlock *ub = u->insn->bb; 365 if (tb == ub) { 366 u->level = 0; 367 for (size_t j = i + 1; j < texes.size() && 368 texes[j]->bb == tb && texes[j]->serial < u->insn->serial; 369 ++j) 370 u->level++; 371 } else { 372 u->level = fn->cfg.findLightestPathWeight(&tb->cfg, 373 &ub->cfg, texCounts); 374 if (u->level < 0) { 375 WARN("Failed to find path TEX -> TEXBAR\n"); 376 u->level = 0; 377 continue; 378 } 379 // this counted all TEXes in the origin block, correct that 380 u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */; 381 // and did not count the TEXes in the destination block, add those 382 for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() && 383 texes[j]->bb == ub && texes[j]->serial < u->insn->serial; 384 ++j) 385 u->level++; 386 } 387 assert(u->level >= 0); 388 useVec.push_back(*u); 389 } 390 } 391 delete[] uses; 392 393 // insert the barriers 394 for (size_t i = 0; i < useVec.size(); ++i) { 395 Instruction *prev = useVec[i].insn->prev; 396 if (useVec[i].level < 0) 397 continue; 398 if (prev && prev->op == OP_TEXBAR) { 399 if (prev->subOp > useVec[i].level) 400 prev->subOp = useVec[i].level; 401 prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0)); 402 } else { 403 Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE); 404 bar->fixed = 1; 405 bar->subOp = useVec[i].level; 406 // make use explicit to ease latency calculation 407 bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0)); 408 useVec[i].insn->bb->insertBefore(useVec[i].insn, bar); 409 } 410 } 411 412 if (fn->getProgram()->optLevel < 3) 413 return true; 414 415 std::vector<Limits> limitT, limitB, limitS; // entry, exit, single 416 417 limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0)); 418 limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0)); 419 limitS.resize(fn->allBBlocks.getSize()); 420 421 // cull 
unneeded barriers (should do that earlier, but for simplicity) 422 IteratorRef bi = fn->cfg.iteratorCFG(); 423 // first calculate min/max outstanding TEXes for each BB 424 for (bi->reset(); !bi->end(); bi->next()) { 425 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get()); 426 BasicBlock *bb = BasicBlock::get(n); 427 int min = 0; 428 int max = std::numeric_limits<int>::max(); 429 for (Instruction *i = bb->getFirst(); i; i = i->next) { 430 if (isTextureOp(i->op)) { 431 min++; 432 if (max < std::numeric_limits<int>::max()) 433 max++; 434 } else 435 if (i->op == OP_TEXBAR) { 436 min = MIN2(min, i->subOp); 437 max = MIN2(max, i->subOp); 438 } 439 } 440 // limits when looking at an isolated block 441 limitS[bb->getId()].min = min; 442 limitS[bb->getId()].max = max; 443 } 444 // propagate the min/max values 445 for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) { 446 for (bi->reset(); !bi->end(); bi->next()) { 447 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get()); 448 BasicBlock *bb = BasicBlock::get(n); 449 const int bbId = bb->getId(); 450 for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) { 451 BasicBlock *in = BasicBlock::get(ei.getNode()); 452 const int inId = in->getId(); 453 limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min); 454 limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max); 455 } 456 // I just hope this is correct ... 457 if (limitS[bbId].max == std::numeric_limits<int>::max()) { 458 // no barrier 459 limitB[bbId].min = limitT[bbId].min + limitS[bbId].min; 460 limitB[bbId].max = limitT[bbId].max + limitS[bbId].min; 461 } else { 462 // block contained a barrier 463 limitB[bbId].min = MIN2(limitS[bbId].max, 464 limitT[bbId].min + limitS[bbId].min); 465 limitB[bbId].max = MIN2(limitS[bbId].max, 466 limitT[bbId].max + limitS[bbId].min); 467 } 468 } 469 } 470 // finally delete unnecessary barriers 471 for (bi->reset(); !bi->end(); bi->next()) { 472 Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get()); 473 BasicBlock *bb = BasicBlock::get(n); 474 Instruction *prev = NULL; 475 Instruction *next; 476 int max = limitT[bb->getId()].max; 477 for (Instruction *i = bb->getFirst(); i; i = next) { 478 next = i->next; 479 if (i->op == OP_TEXBAR) { 480 if (i->subOp >= max) { 481 delete_Instruction(prog, i); 482 i = NULL; 483 } else { 484 max = i->subOp; 485 if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) { 486 delete_Instruction(prog, prev); 487 prev = NULL; 488 } 489 } 490 } else 491 if (isTextureOp(i->op)) { 492 max++; 493 } 494 if (i && !i->isNop()) 495 prev = i; 496 } 497 } 498 return true; 499 } 500 501 bool 502 NVC0LegalizePostRA::visit(Function *fn) 503 { 504 if (needTexBar) 505 insertTextureBarriers(fn); 506 507 rZero = new_LValue(fn, FILE_GPR); 508 pOne = new_LValue(fn, FILE_PREDICATE); 509 carry = new_LValue(fn, FILE_FLAGS); 510 511 rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 
255 : 63; 512 carry->reg.data.id = 0; 513 pOne->reg.data.id = 7; 514 515 return true; 516 } 517 518 void 519 NVC0LegalizePostRA::replaceZero(Instruction *i) 520 { 521 for (int s = 0; i->srcExists(s); ++s) { 522 if (s == 2 && i->op == OP_SUCLAMP) 523 continue; 524 ImmediateValue *imm = i->getSrc(s)->asImm(); 525 if (imm) { 526 if (i->op == OP_SELP && s == 2) { 527 i->setSrc(s, pOne); 528 if (imm->reg.data.u64 == 0) 529 i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT); 530 } else if (imm->reg.data.u64 == 0) { 531 i->setSrc(s, rZero); 532 } 533 } 534 } 535 } 536 537 // replace CONT with BRA for single unconditional continue 538 bool 539 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb) 540 { 541 if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT) 542 return false; 543 Graph::EdgeIterator ei = bb->cfg.incident(); 544 if (ei.getType() != Graph::Edge::BACK) 545 ei.next(); 546 if (ei.getType() != Graph::Edge::BACK) 547 return false; 548 BasicBlock *contBB = BasicBlock::get(ei.getNode()); 549 550 if (!contBB->getExit() || contBB->getExit()->op != OP_CONT || 551 contBB->getExit()->getPredicate()) 552 return false; 553 contBB->getExit()->op = OP_BRA; 554 bb->remove(bb->getEntry()); // delete PRECONT 555 556 ei.next(); 557 assert(ei.end() || ei.getType() != Graph::Edge::BACK); 558 return true; 559 } 560 561 // replace branches to join blocks with join ops 562 void 563 NVC0LegalizePostRA::propagateJoin(BasicBlock *bb) 564 { 565 if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit) 566 return; 567 for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { 568 BasicBlock *in = BasicBlock::get(ei.getNode()); 569 Instruction *exit = in->getExit(); 570 if (!exit) { 571 in->insertTail(new FlowInstruction(func, OP_JOIN, bb)); 572 // there should always be a terminator instruction 573 WARN("inserted missing terminator in BB:%i\n", in->getId()); 574 } else 575 if (exit->op == OP_BRA) { 576 exit->op = OP_JOIN; 577 exit->asFlow()->limit = 1; // must-not-propagate marker 578 } 579 } 580 bb->remove(bb->getEntry()); 581 } 582 583 bool 584 NVC0LegalizePostRA::visit(BasicBlock *bb) 585 { 586 Instruction *i, *next; 587 588 // remove pseudo operations and non-fixed no-ops, split 64 bit operations 589 for (i = bb->getFirst(); i; i = next) { 590 next = i->next; 591 if (i->op == OP_EMIT || i->op == OP_RESTART) { 592 if (!i->getDef(0)->refCount()) 593 i->setDef(0, NULL); 594 if (i->src(0).getFile() == FILE_IMMEDIATE) 595 i->setSrc(0, rZero); // initial value must be 0 596 replaceZero(i); 597 } else 598 if (i->isNop()) { 599 bb->remove(i); 600 } else 601 if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC && 602 prog->getType() != Program::TYPE_COMPUTE) { 603 // It seems like barriers are never required for tessellation since 604 // the warp size is 32, and there are always at most 32 tcs threads. 605 bb->remove(i); 606 } else 607 if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) { 608 int offset = i->src(0).get()->reg.data.offset; 609 if (abs(offset) > 0x10000) 610 i->src(0).get()->reg.fileIndex += offset >> 16; 611 i->src(0).get()->reg.data.offset = (int)(short)offset; 612 } else { 613 // TODO: Move this to before register allocation for operations that 614 // need the $c register ! 
615 if (typeSizeof(i->dType) == 8) { 616 Instruction *hi; 617 hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry); 618 if (hi) 619 next = hi; 620 } 621 622 if (i->op != OP_MOV && i->op != OP_PFETCH) 623 replaceZero(i); 624 } 625 } 626 if (!bb->getEntry()) 627 return true; 628 629 if (!tryReplaceContWithBra(bb)) 630 propagateJoin(bb); 631 632 return true; 633 } 634 635 NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget()) 636 { 637 bld.setProgram(prog); 638 } 639 640 bool 641 NVC0LoweringPass::visit(Function *fn) 642 { 643 if (prog->getType() == Program::TYPE_GEOMETRY) { 644 assert(!strncmp(fn->getName(), "MAIN", 4)); 645 // TODO: when we generate actual functions pass this value along somehow 646 bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false); 647 gpEmitAddress = bld.loadImm(NULL, 0)->asLValue(); 648 if (fn->cfgExit) { 649 bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false); 650 bld.mkMovToReg(0, gpEmitAddress); 651 } 652 } 653 return true; 654 } 655 656 bool 657 NVC0LoweringPass::visit(BasicBlock *bb) 658 { 659 return true; 660 } 661 662 inline Value * 663 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot) 664 { 665 uint8_t b = prog->driver->io.auxCBSlot; 666 uint32_t off = prog->driver->io.texBindBase + slot * 4; 667 668 if (ptr) 669 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2)); 670 671 return bld. 672 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); 673 } 674 675 // move array source to first slot, convert to u16, add indirections 676 bool 677 NVC0LoweringPass::handleTEX(TexInstruction *i) 678 { 679 const int dim = i->tex.target.getDim() + i->tex.target.isCube(); 680 const int arg = i->tex.target.getArgCount(); 681 const int lyr = arg - (i->tex.target.isMS() ? 2 : 1); 682 const int chipset = prog->getTarget()->getChipset(); 683 684 /* Only normalize in the non-explicit derivatives case. For explicit 685 * derivatives, this is handled in handleManualTXD. 686 */ 687 if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) { 688 Value *src[3], *val; 689 int c; 690 for (c = 0; c < 3; ++c) 691 src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c)); 692 val = bld.getScratch(); 693 bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); 694 bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); 695 bld.mkOp1(OP_RCP, TYPE_F32, val, val); 696 for (c = 0; c < 3; ++c) { 697 i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), 698 i->getSrc(c), val)); 699 } 700 } 701 702 // Arguments to the TEX instruction are a little insane. Even though the 703 // encoding is identical between SM20 and SM30, the arguments mean 704 // different things between Fermi and Kepler+. A lot of arguments are 705 // optional based on flags passed to the instruction. This summarizes the 706 // order of things. 
707 // 708 // Fermi: 709 // array/indirect 710 // coords 711 // sample 712 // lod bias 713 // depth compare 714 // offsets: 715 // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg) 716 // - other: 4 bits each, single reg 717 // 718 // Kepler+: 719 // indirect handle 720 // array (+ offsets for txd in upper 16 bits) 721 // coords 722 // sample 723 // lod bias 724 // depth compare 725 // offsets (same as fermi, except txd which takes it with array) 726 // 727 // Maxwell (tex): 728 // array 729 // coords 730 // indirect handle 731 // sample 732 // lod bias 733 // depth compare 734 // offsets 735 // 736 // Maxwell (txd): 737 // indirect handle 738 // coords 739 // array + offsets 740 // derivatives 741 742 if (chipset >= NVISA_GK104_CHIPSET) { 743 if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { 744 // XXX this ignores tsc, and assumes a 1:1 mapping 745 assert(i->tex.rIndirectSrc >= 0); 746 Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r); 747 i->tex.r = 0xff; 748 i->tex.s = 0x1f; 749 i->setIndirectR(hnd); 750 i->setIndirectS(NULL); 751 } else if (i->tex.r == i->tex.s || i->op == OP_TXF) { 752 if (i->tex.r == 0xffff) 753 i->tex.r = prog->driver->io.fbtexBindBase / 4; 754 else 755 i->tex.r += prog->driver->io.texBindBase / 4; 756 i->tex.s = 0; // only a single cX[] value possible here 757 } else { 758 Value *hnd = bld.getScratch(); 759 Value *rHnd = loadTexHandle(NULL, i->tex.r); 760 Value *sHnd = loadTexHandle(NULL, i->tex.s); 761 762 bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd); 763 764 i->tex.r = 0; // not used for indirect tex 765 i->tex.s = 0; 766 i->setIndirectR(hnd); 767 } 768 if (i->tex.target.isArray()) { 769 LValue *layer = new_LValue(func, FILE_GPR); 770 Value *src = i->getSrc(lyr); 771 const int sat = (i->op == OP_TXF) ? 1 : 0; 772 DataType sTy = (i->op == OP_TXF) ? 
TYPE_U32 : TYPE_F32; 773 bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat; 774 if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) { 775 for (int s = dim; s >= 1; --s) 776 i->setSrc(s, i->getSrc(s - 1)); 777 i->setSrc(0, layer); 778 } else { 779 i->setSrc(dim, layer); 780 } 781 } 782 // Move the indirect reference to the first place 783 if (i->tex.rIndirectSrc >= 0 && ( 784 i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) { 785 Value *hnd = i->getIndirectR(); 786 787 i->setIndirectR(NULL); 788 i->moveSources(0, 1); 789 i->setSrc(0, hnd); 790 i->tex.rIndirectSrc = 0; 791 i->tex.sIndirectSrc = -1; 792 } 793 // Move the indirect reference to right after the coords 794 else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) { 795 Value *hnd = i->getIndirectR(); 796 797 i->setIndirectR(NULL); 798 i->moveSources(arg, 1); 799 i->setSrc(arg, hnd); 800 i->tex.rIndirectSrc = 0; 801 i->tex.sIndirectSrc = -1; 802 } 803 } else 804 // (nvc0) generate and move the tsc/tic/array source to the front 805 if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { 806 LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa 807 808 Value *ticRel = i->getIndirectR(); 809 Value *tscRel = i->getIndirectS(); 810 811 if (i->tex.r == 0xffff) { 812 i->tex.r = 0x20; 813 i->tex.s = 0x10; 814 } 815 816 if (ticRel) { 817 i->setSrc(i->tex.rIndirectSrc, NULL); 818 if (i->tex.r) 819 ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), 820 ticRel, bld.mkImm(i->tex.r)); 821 } 822 if (tscRel) { 823 i->setSrc(i->tex.sIndirectSrc, NULL); 824 if (i->tex.s) 825 tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), 826 tscRel, bld.mkImm(i->tex.s)); 827 } 828 829 Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL; 830 if (arrayIndex) { 831 for (int s = dim; s >= 1; --s) 832 i->setSrc(s, i->getSrc(s - 1)); 833 i->setSrc(0, arrayIndex); 834 } else { 835 i->moveSources(0, 1); 836 } 837 838 if (arrayIndex) { 839 int sat = (i->op == OP_TXF) ? 1 : 0; 840 DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32; 841 bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat; 842 } else { 843 bld.loadImm(src, 0); 844 } 845 846 if (ticRel) 847 bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src); 848 if (tscRel) 849 bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src); 850 851 i->setSrc(0, src); 852 } 853 854 // For nvc0, the sample id has to be in the second operand, as the offset 855 // does. Right now we don't know how to pass both in, and this case can't 856 // happen with OpenGL. On nve0, the sample id is part of the texture 857 // coordinate argument. 858 assert(chipset >= NVISA_GK104_CHIPSET || 859 !i->tex.useOffsets || !i->tex.target.isMS()); 860 861 // offset is between lod and dc 862 if (i->tex.useOffsets) { 863 int n, c; 864 int s = i->srcCount(0xff, true); 865 if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) { 866 if (i->tex.target.isShadow()) 867 s--; 868 if (i->srcExists(s)) // move potential predicate out of the way 869 i->moveSources(s, 1); 870 if (i->tex.useOffsets == 4 && i->srcExists(s + 1)) 871 i->moveSources(s + 1, 1); 872 } 873 if (i->op == OP_TXG) { 874 // Either there is 1 offset, which goes into the 2 low bytes of the 875 // first source, or there are 4 offsets, which go into 2 sources (8 876 // values, 1 byte each). 
877 Value *offs[2] = {NULL, NULL}; 878 for (n = 0; n < i->tex.useOffsets; n++) { 879 for (c = 0; c < 2; ++c) { 880 if ((n % 2) == 0 && c == 0) 881 bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get()); 882 else 883 bld.mkOp3(OP_INSBF, TYPE_U32, 884 offs[n / 2], 885 i->offset[n][c].get(), 886 bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)), 887 offs[n / 2]); 888 } 889 } 890 i->setSrc(s, offs[0]); 891 if (offs[1]) 892 i->setSrc(s + 1, offs[1]); 893 } else { 894 unsigned imm = 0; 895 assert(i->tex.useOffsets == 1); 896 for (c = 0; c < 3; ++c) { 897 ImmediateValue val; 898 if (!i->offset[0][c].getImmediate(val)) 899 assert(!"non-immediate offset passed to non-TXG"); 900 imm |= (val.reg.data.u32 & 0xf) << (c * 4); 901 } 902 if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) { 903 // The offset goes into the upper 16 bits of the array index. So 904 // create it if it's not already there, and INSBF it if it already 905 // is. 906 s = (i->tex.rIndirectSrc >= 0) ? 1 : 0; 907 if (chipset >= NVISA_GM107_CHIPSET) 908 s += dim; 909 if (i->tex.target.isArray()) { 910 bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s), 911 bld.loadImm(NULL, imm), bld.mkImm(0xc10), 912 i->getSrc(s)); 913 } else { 914 i->moveSources(s, 1); 915 i->setSrc(s, bld.loadImm(NULL, imm << 16)); 916 } 917 } else { 918 i->setSrc(s, bld.loadImm(NULL, imm)); 919 } 920 } 921 } 922 923 if (chipset >= NVISA_GK104_CHIPSET) { 924 // 925 // If TEX requires more than 4 sources, the 2nd register tuple must be 926 // aligned to 4, even if it consists of just a single 4-byte register. 927 // 928 // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case. 929 // 930 int s = i->srcCount(0xff, true); 931 if (s > 4 && s < 7) { 932 if (i->srcExists(s)) // move potential predicate out of the way 933 i->moveSources(s, 7 - s); 934 while (s < 7) 935 i->setSrc(s++, bld.loadImm(NULL, 0)); 936 } 937 } 938 939 return true; 940 } 941 942 bool 943 NVC0LoweringPass::handleManualTXD(TexInstruction *i) 944 { 945 static const uint8_t qOps[4][2] = 946 { 947 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 948 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 949 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 950 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 951 }; 952 Value *def[4][4]; 953 Value *crd[3]; 954 Instruction *tex; 955 Value *zero = bld.loadImm(bld.getSSA(), 0); 956 int l, c; 957 const int dim = i->tex.target.getDim() + i->tex.target.isCube(); 958 959 // This function is invoked after handleTEX lowering, so we have to expect 960 // the arguments in the order that the hw wants them. For Fermi, array and 961 // indirect are both in the leading arg, while for Kepler, array and 962 // indirect are separate (and both precede the coordinates). Maxwell is 963 // handled in a separate function. 
964 unsigned array; 965 if (targ->getChipset() < NVISA_GK104_CHIPSET) 966 array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0; 967 else 968 array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0); 969 970 i->op = OP_TEX; // no need to clone dPdx/dPdy later 971 972 for (c = 0; c < dim; ++c) 973 crd[c] = bld.getScratch(); 974 975 bld.mkOp(OP_QUADON, TYPE_NONE, NULL); 976 for (l = 0; l < 4; ++l) { 977 Value *src[3], *val; 978 // mov coordinates from lane l to all lanes 979 for (c = 0; c < dim; ++c) 980 bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero); 981 // add dPdx from lane l to lanes dx 982 for (c = 0; c < dim; ++c) 983 bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]); 984 // add dPdy from lane l to lanes dy 985 for (c = 0; c < dim; ++c) 986 bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); 987 // normalize cube coordinates 988 if (i->tex.target.isCube()) { 989 for (c = 0; c < 3; ++c) 990 src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); 991 val = bld.getScratch(); 992 bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); 993 bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); 994 bld.mkOp1(OP_RCP, TYPE_F32, val, val); 995 for (c = 0; c < 3; ++c) 996 src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); 997 } else { 998 for (c = 0; c < dim; ++c) 999 src[c] = crd[c]; 1000 } 1001 // texture 1002 bld.insert(tex = cloneForward(func, i)); 1003 for (c = 0; c < dim; ++c) 1004 tex->setSrc(c + array, src[c]); 1005 // save results 1006 for (c = 0; i->defExists(c); ++c) { 1007 Instruction *mov; 1008 def[c][l] = bld.getSSA(); 1009 mov = bld.mkMov(def[c][l], tex->getDef(c)); 1010 mov->fixed = 1; 1011 mov->lanes = 1 << l; 1012 } 1013 } 1014 bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); 1015 1016 for (c = 0; i->defExists(c); ++c) { 1017 Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); 1018 for (l = 0; l < 4; ++l) 1019 u->setSrc(l, def[c][l]); 1020 } 1021 1022 i->bb->remove(i); 1023 return true; 1024 } 1025 1026 bool 1027 NVC0LoweringPass::handleTXD(TexInstruction *txd) 1028 { 1029 int dim = txd->tex.target.getDim() + txd->tex.target.isCube(); 1030 unsigned arg = txd->tex.target.getArgCount(); 1031 unsigned expected_args = arg; 1032 const int chipset = prog->getTarget()->getChipset(); 1033 1034 if (chipset >= NVISA_GK104_CHIPSET) { 1035 if (!txd->tex.target.isArray() && txd->tex.useOffsets) 1036 expected_args++; 1037 if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0) 1038 expected_args++; 1039 } else { 1040 if (txd->tex.useOffsets) 1041 expected_args++; 1042 if (!txd->tex.target.isArray() && ( 1043 txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)) 1044 expected_args++; 1045 } 1046 1047 if (expected_args > 4 || 1048 dim > 2 || 1049 txd->tex.target.isShadow()) 1050 txd->op = OP_TEX; 1051 1052 handleTEX(txd); 1053 while (txd->srcExists(arg)) 1054 ++arg; 1055 1056 txd->tex.derivAll = true; 1057 if (txd->op == OP_TEX) 1058 return handleManualTXD(txd); 1059 1060 assert(arg == expected_args); 1061 for (int c = 0; c < dim; ++c) { 1062 txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]); 1063 txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]); 1064 txd->dPdx[c].set(NULL); 1065 txd->dPdy[c].set(NULL); 1066 } 1067 1068 // In this case we have fewer than 4 "real" arguments, which means that 1069 // handleTEX didn't apply any padding. However we have to make sure that 1070 // the second "group" of arguments still gets padded up to 4. 
1071 if (chipset >= NVISA_GK104_CHIPSET) { 1072 int s = arg + 2 * dim; 1073 if (s >= 4 && s < 7) { 1074 if (txd->srcExists(s)) // move potential predicate out of the way 1075 txd->moveSources(s, 7 - s); 1076 while (s < 7) 1077 txd->setSrc(s++, bld.loadImm(NULL, 0)); 1078 } 1079 } 1080 1081 return true; 1082 } 1083 1084 bool 1085 NVC0LoweringPass::handleTXQ(TexInstruction *txq) 1086 { 1087 const int chipset = prog->getTarget()->getChipset(); 1088 if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0) 1089 txq->tex.r += prog->driver->io.texBindBase / 4; 1090 1091 if (txq->tex.rIndirectSrc < 0) 1092 return true; 1093 1094 Value *ticRel = txq->getIndirectR(); 1095 1096 txq->setIndirectS(NULL); 1097 txq->tex.sIndirectSrc = -1; 1098 1099 assert(ticRel); 1100 1101 if (chipset < NVISA_GK104_CHIPSET) { 1102 LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa 1103 1104 txq->setSrc(txq->tex.rIndirectSrc, NULL); 1105 if (txq->tex.r) 1106 ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), 1107 ticRel, bld.mkImm(txq->tex.r)); 1108 1109 bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17)); 1110 1111 txq->moveSources(0, 1); 1112 txq->setSrc(0, src); 1113 } else { 1114 Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r); 1115 txq->tex.r = 0xff; 1116 txq->tex.s = 0x1f; 1117 1118 txq->setIndirectR(NULL); 1119 txq->moveSources(0, 1); 1120 txq->setSrc(0, hnd); 1121 txq->tex.rIndirectSrc = 0; 1122 } 1123 1124 return true; 1125 } 1126 1127 bool 1128 NVC0LoweringPass::handleTXLQ(TexInstruction *i) 1129 { 1130 /* The outputs are inverted compared to what the TGSI instruction 1131 * expects. Take that into account in the mask. 1132 */ 1133 assert((i->tex.mask & ~3) == 0); 1134 if (i->tex.mask == 1) 1135 i->tex.mask = 2; 1136 else if (i->tex.mask == 2) 1137 i->tex.mask = 1; 1138 handleTEX(i); 1139 bld.setPosition(i, true); 1140 1141 /* The returned values are not quite what we want: 1142 * (a) convert from s16/u16 to f32 1143 * (b) multiply by 1/256 1144 */ 1145 for (int def = 0; def < 2; ++def) { 1146 if (!i->defExists(def)) 1147 continue; 1148 enum DataType type = TYPE_S16; 1149 if (i->tex.mask == 2 || def > 0) 1150 type = TYPE_U16; 1151 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def)); 1152 bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def), 1153 i->getDef(def), bld.loadImm(NULL, 1.0f / 256)); 1154 } 1155 if (i->tex.mask == 3) { 1156 LValue *t = new_LValue(func, FILE_GPR); 1157 bld.mkMov(t, i->getDef(0)); 1158 bld.mkMov(i->getDef(0), i->getDef(1)); 1159 bld.mkMov(i->getDef(1), t); 1160 } 1161 return true; 1162 } 1163 1164 bool 1165 NVC0LoweringPass::handleBUFQ(Instruction *bufq) 1166 { 1167 bufq->op = OP_MOV; 1168 bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1), 1169 bufq->getSrc(0)->reg.fileIndex * 16)); 1170 bufq->setIndirect(0, 0, NULL); 1171 bufq->setIndirect(0, 1, NULL); 1172 return true; 1173 } 1174 1175 void 1176 NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom) 1177 { 1178 assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); 1179 1180 BasicBlock *currBB = atom->bb; 1181 BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false); 1182 BasicBlock *joinBB = atom->bb->splitAfter(atom); 1183 BasicBlock *setAndUnlockBB = new BasicBlock(func); 1184 BasicBlock *failLockBB = new BasicBlock(func); 1185 1186 bld.setPosition(currBB, true); 1187 assert(!currBB->joinAt); 1188 currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); 1189 1190 CmpInstruction *pred = 1191 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 1192 TYPE_U32, 
bld.mkImm(0), bld.mkImm(1)); 1193 1194 bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL); 1195 currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE); 1196 1197 bld.setPosition(tryLockBB, true); 1198 1199 Instruction *ld = 1200 bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(), 1201 atom->getIndirect(0, 0)); 1202 ld->setDef(1, bld.getSSA(1, FILE_PREDICATE)); 1203 ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED; 1204 1205 bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1)); 1206 bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); 1207 tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS); 1208 tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE); 1209 1210 tryLockBB->cfg.detach(&joinBB->cfg); 1211 bld.remove(atom); 1212 1213 bld.setPosition(setAndUnlockBB, true); 1214 Value *stVal; 1215 if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) { 1216 // Read the old value, and write the new one. 1217 stVal = atom->getSrc(1); 1218 } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) { 1219 CmpInstruction *set = 1220 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(), 1221 TYPE_U32, ld->getDef(0), atom->getSrc(1)); 1222 1223 bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()), 1224 TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0)); 1225 } else { 1226 operation op; 1227 1228 switch (atom->subOp) { 1229 case NV50_IR_SUBOP_ATOM_ADD: 1230 op = OP_ADD; 1231 break; 1232 case NV50_IR_SUBOP_ATOM_AND: 1233 op = OP_AND; 1234 break; 1235 case NV50_IR_SUBOP_ATOM_OR: 1236 op = OP_OR; 1237 break; 1238 case NV50_IR_SUBOP_ATOM_XOR: 1239 op = OP_XOR; 1240 break; 1241 case NV50_IR_SUBOP_ATOM_MIN: 1242 op = OP_MIN; 1243 break; 1244 case NV50_IR_SUBOP_ATOM_MAX: 1245 op = OP_MAX; 1246 break; 1247 default: 1248 assert(0); 1249 return; 1250 } 1251 1252 stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0), 1253 atom->getSrc(1)); 1254 } 1255 1256 Instruction *st = 1257 bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(), 1258 atom->getIndirect(0, 0), stVal); 1259 st->setDef(0, pred->getDef(0)); 1260 st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED; 1261 1262 bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); 1263 setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE); 1264 1265 // Lock until the store has not been performed. 
1266 bld.setPosition(failLockBB, true); 1267 bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0)); 1268 bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL); 1269 failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK); 1270 failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE); 1271 1272 bld.setPosition(joinBB, false); 1273 bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; 1274 } 1275 1276 void 1277 NVC0LoweringPass::handleSharedATOM(Instruction *atom) 1278 { 1279 assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); 1280 1281 BasicBlock *currBB = atom->bb; 1282 BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false); 1283 BasicBlock *joinBB = atom->bb->splitAfter(atom); 1284 1285 bld.setPosition(currBB, true); 1286 assert(!currBB->joinAt); 1287 currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); 1288 1289 bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL); 1290 currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE); 1291 1292 bld.setPosition(tryLockAndSetBB, true); 1293 1294 Instruction *ld = 1295 bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(), 1296 atom->getIndirect(0, 0)); 1297 ld->setDef(1, bld.getSSA(1, FILE_PREDICATE)); 1298 ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED; 1299 1300 Value *stVal; 1301 if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) { 1302 // Read the old value, and write the new one. 1303 stVal = atom->getSrc(1); 1304 } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) { 1305 CmpInstruction *set = 1306 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), 1307 TYPE_U32, ld->getDef(0), atom->getSrc(1)); 1308 set->setPredicate(CC_P, ld->getDef(1)); 1309 1310 Instruction *selp = 1311 bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0), 1312 atom->getSrc(2), set->getDef(0)); 1313 selp->src(2).mod = Modifier(NV50_IR_MOD_NOT); 1314 selp->setPredicate(CC_P, ld->getDef(1)); 1315 1316 stVal = selp->getDef(0); 1317 } else { 1318 operation op; 1319 1320 switch (atom->subOp) { 1321 case NV50_IR_SUBOP_ATOM_ADD: 1322 op = OP_ADD; 1323 break; 1324 case NV50_IR_SUBOP_ATOM_AND: 1325 op = OP_AND; 1326 break; 1327 case NV50_IR_SUBOP_ATOM_OR: 1328 op = OP_OR; 1329 break; 1330 case NV50_IR_SUBOP_ATOM_XOR: 1331 op = OP_XOR; 1332 break; 1333 case NV50_IR_SUBOP_ATOM_MIN: 1334 op = OP_MIN; 1335 break; 1336 case NV50_IR_SUBOP_ATOM_MAX: 1337 op = OP_MAX; 1338 break; 1339 default: 1340 assert(0); 1341 return; 1342 } 1343 1344 Instruction *i = 1345 bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0), 1346 atom->getSrc(1)); 1347 i->setPredicate(CC_P, ld->getDef(1)); 1348 1349 stVal = i->getDef(0); 1350 } 1351 1352 Instruction *st = 1353 bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(), 1354 atom->getIndirect(0, 0), stVal); 1355 st->setPredicate(CC_P, ld->getDef(1)); 1356 st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED; 1357 1358 // Loop until the lock is acquired. 
1359 bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1)); 1360 tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK); 1361 tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS); 1362 bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL); 1363 1364 bld.remove(atom); 1365 1366 bld.setPosition(joinBB, false); 1367 bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; 1368 } 1369 1370 bool 1371 NVC0LoweringPass::handleATOM(Instruction *atom) 1372 { 1373 SVSemantic sv; 1374 Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base; 1375 1376 switch (atom->src(0).getFile()) { 1377 case FILE_MEMORY_LOCAL: 1378 sv = SV_LBASE; 1379 break; 1380 case FILE_MEMORY_SHARED: 1381 // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic 1382 // operations on shared memory. For Maxwell, ATOMS is enough. 1383 if (targ->getChipset() < NVISA_GK104_CHIPSET) 1384 handleSharedATOM(atom); 1385 else if (targ->getChipset() < NVISA_GM107_CHIPSET) 1386 handleSharedATOMNVE4(atom); 1387 return true; 1388 default: 1389 assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER); 1390 base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16); 1391 assert(base->reg.size == 8); 1392 if (ptr) 1393 base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr); 1394 assert(base->reg.size == 8); 1395 atom->setIndirect(0, 0, base); 1396 atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; 1397 1398 // Harden against out-of-bounds accesses 1399 Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType)); 1400 Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16); 1401 Value *pred = new_LValue(func, FILE_PREDICATE); 1402 if (ptr) 1403 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr); 1404 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length); 1405 atom->setPredicate(CC_NOT_P, pred); 1406 if (atom->defExists(0)) { 1407 Value *zero, *dst = atom->getDef(0); 1408 atom->setDef(0, bld.getSSA()); 1409 1410 bld.setPosition(atom, true); 1411 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0)) 1412 ->setPredicate(CC_P, pred); 1413 bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero); 1414 } 1415 1416 return true; 1417 } 1418 base = 1419 bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0)); 1420 1421 atom->setSrc(0, cloneShallow(func, atom->getSrc(0))); 1422 atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; 1423 if (ptr) 1424 base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr); 1425 atom->setIndirect(0, 1, NULL); 1426 atom->setIndirect(0, 0, base); 1427 1428 return true; 1429 } 1430 1431 bool 1432 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl) 1433 { 1434 if (targ->getChipset() < NVISA_GM107_CHIPSET) { 1435 if (cas->src(0).getFile() == FILE_MEMORY_SHARED) { 1436 // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM(). 1437 return false; 1438 } 1439 } 1440 1441 if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS && 1442 cas->subOp != NV50_IR_SUBOP_ATOM_EXCH) 1443 return false; 1444 bld.setPosition(cas, true); 1445 1446 if (needCctl) { 1447 Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0)); 1448 cctl->setIndirect(0, 0, cas->getIndirect(0, 0)); 1449 cctl->fixed = 1; 1450 cctl->subOp = NV50_IR_SUBOP_CCTL_IV; 1451 if (cas->isPredicated()) 1452 cctl->setPredicate(cas->cc, cas->getPredicate()); 1453 } 1454 1455 if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) { 1456 // CAS is crazy. 
It's 2nd source is a double reg, and the 3rd source 1457 // should be set to the high part of the double reg or bad things will 1458 // happen elsewhere in the universe. 1459 // Also, it sometimes returns the new value instead of the old one 1460 // under mysterious circumstances. 1461 Value *dreg = bld.getSSA(8); 1462 bld.setPosition(cas, false); 1463 bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2)); 1464 cas->setSrc(1, dreg); 1465 cas->setSrc(2, dreg); 1466 } 1467 1468 return true; 1469 } 1470 1471 inline Value * 1472 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base) 1473 { 1474 uint8_t b = prog->driver->io.auxCBSlot; 1475 off += base; 1476 1477 return bld. 1478 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); 1479 } 1480 1481 inline Value * 1482 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base) 1483 { 1484 uint8_t b = prog->driver->io.auxCBSlot; 1485 off += base; 1486 1487 if (ptr) 1488 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4)); 1489 1490 return bld. 1491 mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr); 1492 } 1493 1494 inline Value * 1495 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base) 1496 { 1497 uint8_t b = prog->driver->io.auxCBSlot; 1498 off += base; 1499 1500 if (ptr) 1501 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4)); 1502 1503 return bld. 1504 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr); 1505 } 1506 1507 inline Value * 1508 NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off) 1509 { 1510 return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase); 1511 } 1512 1513 inline Value * 1514 NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off) 1515 { 1516 return loadResLength32(ptr, off, prog->driver->io.bufInfoBase); 1517 } 1518 1519 inline Value * 1520 NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off) 1521 { 1522 return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase); 1523 } 1524 1525 inline Value * 1526 NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off) 1527 { 1528 return loadResLength32(ptr, off, prog->driver->io.uboInfoBase); 1529 } 1530 1531 inline Value * 1532 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off) 1533 { 1534 uint8_t b = prog->driver->io.msInfoCBSlot; 1535 off += prog->driver->io.msInfoBase; 1536 return bld. 1537 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); 1538 } 1539 1540 /* On nvc0, surface info is obtained via the surface binding points passed 1541 * to the SULD/SUST instructions. 1542 * On nve4, surface info is stored in c[] and is used by various special 1543 * instructions, e.g. for clamping coordinates or generating an address. 1544 * They couldn't just have added an equivalent to TIC now, couldn't they ? 
1545 */ 1546 #define NVC0_SU_INFO_ADDR 0x00 1547 #define NVC0_SU_INFO_FMT 0x04 1548 #define NVC0_SU_INFO_DIM_X 0x08 1549 #define NVC0_SU_INFO_PITCH 0x0c 1550 #define NVC0_SU_INFO_DIM_Y 0x10 1551 #define NVC0_SU_INFO_ARRAY 0x14 1552 #define NVC0_SU_INFO_DIM_Z 0x18 1553 #define NVC0_SU_INFO_UNK1C 0x1c 1554 #define NVC0_SU_INFO_WIDTH 0x20 1555 #define NVC0_SU_INFO_HEIGHT 0x24 1556 #define NVC0_SU_INFO_DEPTH 0x28 1557 #define NVC0_SU_INFO_TARGET 0x2c 1558 #define NVC0_SU_INFO_BSIZE 0x30 1559 #define NVC0_SU_INFO_RAW_X 0x34 1560 #define NVC0_SU_INFO_MS_X 0x38 1561 #define NVC0_SU_INFO_MS_Y 0x3c 1562 1563 #define NVC0_SU_INFO__STRIDE 0x40 1564 1565 #define NVC0_SU_INFO_DIM(i) (0x08 + (i) * 8) 1566 #define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4) 1567 #define NVC0_SU_INFO_MS(i) (0x38 + (i) * 4) 1568 1569 inline Value * 1570 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off) 1571 { 1572 uint32_t base = slot * NVC0_SU_INFO__STRIDE; 1573 1574 if (ptr) { 1575 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot)); 1576 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7)); 1577 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6)); 1578 base = 0; 1579 } 1580 off += base; 1581 1582 return loadResInfo32(ptr, off, prog->driver->io.suInfoBase); 1583 } 1584 1585 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c) 1586 { 1587 switch (su->tex.target.getEnum()) { 1588 case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1); 1589 case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1590 case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1591 case TEX_TARGET_1D_ARRAY: return (c == 1) ? 1592 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) : 1593 NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1594 case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2); 1595 case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2); 1596 case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1597 case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1598 case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1599 case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1600 case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2); 1601 default: 1602 assert(0); 1603 return 0; 1604 } 1605 } 1606 1607 bool 1608 NVC0LoweringPass::handleSUQ(TexInstruction *suq) 1609 { 1610 int mask = suq->tex.mask; 1611 int dim = suq->tex.target.getDim(); 1612 int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube()); 1613 Value *ind = suq->getIndirectR(); 1614 int slot = suq->tex.r; 1615 int c, d; 1616 1617 for (c = 0, d = 0; c < 3; ++c, mask >>= 1) { 1618 if (c >= arg || !(mask & 1)) 1619 continue; 1620 1621 int offset; 1622 1623 if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) { 1624 offset = NVC0_SU_INFO_SIZE(2); 1625 } else { 1626 offset = NVC0_SU_INFO_SIZE(c); 1627 } 1628 bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset)); 1629 if (c == 2 && suq->tex.target.isCube()) 1630 bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1), 1631 bld.loadImm(NULL, 6)); 1632 } 1633 1634 if (mask & 1) { 1635 if (suq->tex.target.isMS()) { 1636 Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0)); 1637 Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1)); 1638 Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y); 1639 bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms); 1640 } else { 1641 bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1)); 1642 } 1643 } 1644 1645 
bld.remove(suq); 1646 return true; 1647 } 1648 1649 void 1650 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex) 1651 { 1652 const int arg = tex->tex.target.getArgCount(); 1653 int slot = tex->tex.r; 1654 1655 if (tex->tex.target == TEX_TARGET_2D_MS) 1656 tex->tex.target = TEX_TARGET_2D; 1657 else 1658 if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY) 1659 tex->tex.target = TEX_TARGET_2D_ARRAY; 1660 else 1661 return; 1662 1663 Value *x = tex->getSrc(0); 1664 Value *y = tex->getSrc(1); 1665 Value *s = tex->getSrc(arg - 1); 1666 1667 Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA(); 1668 Value *ind = tex->getIndirectR(); 1669 1670 Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0)); 1671 Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1)); 1672 1673 bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x); 1674 bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y); 1675 1676 s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7)); 1677 s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3)); 1678 1679 Value *dx = loadMsInfo32(ts, 0x0); 1680 Value *dy = loadMsInfo32(ts, 0x4); 1681 1682 bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx); 1683 bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy); 1684 1685 tex->setSrc(0, tx); 1686 tex->setSrc(1, ty); 1687 tex->moveSources(arg, -1); 1688 } 1689 1690 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST. 1691 // They're computed from the coordinates using the surface info in c[] space. 1692 void 1693 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) 1694 { 1695 Instruction *insn; 1696 const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP; 1697 const bool raw = 1698 su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB; 1699 const int slot = su->tex.r; 1700 const int dim = su->tex.target.getDim(); 1701 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); 1702 int c; 1703 Value *zero = bld.mkImm(0); 1704 Value *p1 = NULL; 1705 Value *v; 1706 Value *src[3]; 1707 Value *bf, *eau, *off; 1708 Value *addr, *pred; 1709 Value *ind = su->getIndirectR(); 1710 1711 off = bld.getScratch(4); 1712 bf = bld.getScratch(4); 1713 addr = bld.getSSA(8); 1714 pred = bld.getScratch(1, FILE_PREDICATE); 1715 1716 bld.setPosition(su, false); 1717 1718 adjustCoordinatesMS(su); 1719 1720 // calculate clamped coordinates 1721 for (c = 0; c < arg; ++c) { 1722 int dimc = c; 1723 1724 if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) { 1725 // The array index is stored in the Z component for 1D arrays. 
1726 dimc = 2; 1727 } 1728 1729 src[c] = bld.getScratch(); 1730 if (c == 0 && raw) 1731 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X); 1732 else 1733 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc)); 1734 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero) 1735 ->subOp = getSuClampSubOp(su, dimc); 1736 } 1737 for (; c < 3; ++c) 1738 src[c] = zero; 1739 1740 // set predicate output 1741 if (su->tex.target == TEX_TARGET_BUFFER) { 1742 src[0]->getInsn()->setFlagsDef(1, pred); 1743 } else 1744 if (su->tex.target.isArray() || su->tex.target.isCube()) { 1745 p1 = bld.getSSA(1, FILE_PREDICATE); 1746 src[dim]->getInsn()->setFlagsDef(1, p1); 1747 } 1748 1749 // calculate pixel offset 1750 if (dim == 1) { 1751 if (su->tex.target != TEX_TARGET_BUFFER) 1752 bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff)); 1753 } else 1754 if (dim == 3) { 1755 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C); 1756 bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1]) 1757 ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l 1758 1759 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH); 1760 bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0]) 1761 ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l 1762 } else { 1763 assert(dim == 2); 1764 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH); 1765 bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0]) 1766 ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ? 1767 NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l 1768 } 1769 1770 // calculate effective address part 1 1771 if (su->tex.target == TEX_TARGET_BUFFER) { 1772 if (raw) { 1773 bf = src[0]; 1774 } else { 1775 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT); 1776 bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero) 1777 ->subOp = NV50_IR_SUBOP_V1(7,6,8|2); 1778 } 1779 } else { 1780 Value *y = src[1]; 1781 Value *z = src[2]; 1782 uint16_t subOp = 0; 1783 1784 switch (dim) { 1785 case 1: 1786 y = zero; 1787 z = zero; 1788 break; 1789 case 2: 1790 z = off; 1791 if (!su->tex.target.isArray() && !su->tex.target.isCube()) { 1792 z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C); 1793 subOp = NV50_IR_SUBOP_SUBFM_3D; 1794 } 1795 break; 1796 default: 1797 subOp = NV50_IR_SUBOP_SUBFM_3D; 1798 assert(dim == 3); 1799 break; 1800 } 1801 insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z); 1802 insn->subOp = subOp; 1803 insn->setFlagsDef(1, pred); 1804 } 1805 1806 // part 2 1807 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR); 1808 1809 if (su->tex.target == TEX_TARGET_BUFFER) { 1810 eau = v; 1811 } else { 1812 eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v); 1813 } 1814 // add array layer offset 1815 if (su->tex.target.isArray() || su->tex.target.isCube()) { 1816 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY); 1817 if (dim == 1) 1818 bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau) 1819 ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32 1820 else 1821 bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau) 1822 ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32 1823 // combine predicates 1824 assert(p1); 1825 bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1); 1826 } 1827 1828 if (atom) { 1829 Value *lo = bf; 1830 if (su->tex.target == TEX_TARGET_BUFFER) { 1831 lo = zero; 1832 bld.mkMov(off, bf); 1833 } 1834 // bf == g[] address & 0xff 1835 // eau == g[] address >> 8 1836 bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau); 1837 bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau); 1838 } else 1839 if (su->op == 
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it to 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred1 =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));

   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
         format->bits[2] + format->bits[3];

      // make sure that the format doesn't mismatch
      assert(format->components != 0);
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred1->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred1->getDef(0));

   // TODO: initialize def values to 0 when the surface operation is not
   // performed (not needed for stores). Also, fix the "address bounds test"
   // subtests from arb_shader_image_load_store-invalid for buffers, because it
   // seems like the predicate is not correctly set by suclamp.
}

static DataType
getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
{
   switch (t->type) {
   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
   case UINT:
      return (t->bits[c] == 8 ? TYPE_U8 :
              (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
   case SINT:
      return (t->bits[c] == 8 ? TYPE_S8 :
              (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
   }
   return TYPE_NONE;
}

static DataType
getDestType(const ImgType type) {
   switch (type) {
   case FLOAT:
   case UNORM:
   case SNORM:
      return TYPE_F32;
   case UINT:
      return TYPE_U32;
   case SINT:
      return TYPE_S32;
   default:
      assert(!"Impossible type");
      return TYPE_NONE;
   }
}

void
NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
{
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   int width = format->bits[0] + format->bits[1] +
      format->bits[2] + format->bits[3];
   Value *untypedDst[4] = {};
   Value *typedDst[4] = {};

   // We must convert this to a generic load.
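   // The raw load returns the packed texel bits; the typed per-component
   // values are unpacked from them after the load below.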
   su->op = OP_SULDB;

   su->dType = typeOfSize(width / 8);
   su->sType = TYPE_U8;

   for (int i = 0; i < width / 32; i++)
      untypedDst[i] = bld.getSSA();
   if (width < 32)
      untypedDst[0] = bld.getSSA();

   for (int i = 0; i < 4; i++) {
      typedDst[i] = su->getDef(i);
   }

   // Set the untyped dsts as the su's destinations
   for (int i = 0; i < 4; i++)
      su->setDef(i, untypedDst[i]);

   bld.setPosition(su, true);

   // Unpack each component into the typed dsts
   int bits = 0;
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
      if (!typedDst[i])
         continue;
      if (i >= format->components) {
         if (format->type == FLOAT ||
             format->type == UNORM ||
             format->type == SNORM)
            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
         else
            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
         continue;
      }

      // Get just that component's data into the relevant place
      if (format->bits[i] == 32)
         bld.mkMov(typedDst[i], untypedDst[i]);
      else if (format->bits[i] == 16)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[i / 2])
            ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
      else if (format->bits[i] == 8)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[0])->subOp = i;
      else {
         bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
                   bld.mkImm((bits % 32) | (format->bits[i] << 8)));
         if (format->type == UNORM || format->type == SNORM)
            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i),
                      typedDst[i]);
      }

      // Normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i],
                   bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i],
                   bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
      else if (format->type == FLOAT && format->bits[i] < 16) {
         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i],
                   bld.loadImm(NULL, 15 - format->bits[i]));
         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
      }
   }

   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }
}

void
NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
{
   processSurfaceCoordsNVE4(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      assert(su->getPredicate());
      Value *pred =
         bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
                    su->getPredicate(), su->getSrc(2));

      Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
      red->subOp = su->subOp;
      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
      red->setSrc(1, su->getSrc(3));
      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(4));
      red->setIndirect(0, 0, su->getSrc(0));

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      red->setPredicate(su->cc, pred);
      mov->setPredicate(CC_P, pred);

      bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
                red->getDef(0), mov->getDef(0));

      delete_Instruction(bld.getProgram(), su);
      handleCasExch(red, true);
   }

   if (su->op == OP_SUSTB || su->op == OP_SUSTP)
      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
}

void
NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *src[3];
   Value *v;
   Value *ind = su->getIndirectR();

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   if (ind) {
      Value *ptr;
      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind,
                       bld.mkImm(su->tex.r));
      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
      su->setIndirectR(ptr);
   }

   // get surface coordinates
   for (c = 0; c < arg; ++c)
      src[c] = su->getSrc(c);
   for (; c < 3; ++c)
      src[c] = zero;

   // calculate pixel offset
   if (su->op == OP_SULDP || su->op == OP_SUREDP) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE);
      su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
   }

   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
      assert(dim > 1);
      su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
   }

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
         format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // make sure that the format doesn't mismatch when it's not FMT_NONE
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}

void
NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
{
   if (su->tex.target == TEX_TARGET_1D_ARRAY) {
      /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
       * will simplify the lowering pass and the texture constraints.
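       * A zero is inserted below as the new second (Y) coordinate.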
       */
      su->moveSources(1, 1);
      su->setSrc(1, bld.loadImm(NULL, 0));
      su->tex.target = TEX_TARGET_2D_ARRAY;
   }

   processSurfaceCoordsNVC0(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      const int dim = su->tex.target.getDim();
      const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
      LValue *addr = bld.getSSA(8);
      Value *def = su->getDef(0);

      su->op = OP_SULEA;

      // Set the destination to the address
      su->dType = TYPE_U64;
      su->setDef(0, addr);
      su->setDef(1, su->getPredicate());

      bld.setPosition(su, true);

      // Perform the atomic op
      Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
      red->subOp = su->subOp;
      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
      red->setSrc(1, su->getSrc(arg));
      if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(arg + 1));
      red->setIndirect(0, 0, addr);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      red->setPredicate(su->cc, su->getPredicate());
      mov->setPredicate(CC_P, su->getPredicate());

      bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));

      handleCasExch(red, false);
   }
}

void
NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   Value *ind = su->getIndirectR();
   int pos = 0;

   bld.setPosition(su, false);

   // add texture handle
   switch (su->op) {
   case OP_SUSTP:
      pos = 4;
      break;
   case OP_SUREDP:
      pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
      break;
   default:
      assert(pos == 0);
      break;
   }
   su->setSrc(arg + pos, loadTexHandle(ind, slot + 32));

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
         format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // make sure that the format doesn't mismatch when it's not FMT_NONE
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}

void
NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
{
   processSurfaceCoordsGM107(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDP) {
      Value *def = su->getDef(0);

      su->op = OP_SUREDB;
      su->setDef(0, bld.getSSA());

      bld.setPosition(su, true);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      mov->setPredicate(CC_P, su->getPredicate());

      bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
   }
}

bool
NVC0LoweringPass::handleWRSV(Instruction *i)
{
   Instruction *st;
   Symbol *sym;
   uint32_t addr;

   // must replace, $sreg are not writeable
   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
                    i->getSrc(1));
   st->perPatch = i->perPatch;

   bld.getBB()->remove(i);
   return true;
}

void
NVC0LoweringPass::handleLDST(Instruction *i)
{
   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
      if (prog->getType() == Program::TYPE_COMPUTE) {
         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
         i->getSrc(0)->reg.fileIndex = 0;
      } else
      if (prog->getType() == Program::TYPE_GEOMETRY &&
          i->src(0).isIndirect(0)) {
         // XXX: this assumes vec4 units
         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                 i->getIndirect(0, 0), bld.mkImm(4));
         i->setIndirect(0, 0, ptr);
         i->op = OP_VFETCH;
      } else {
         i->op = OP_VFETCH;
         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
      }
   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
          prog->getType() == Program::TYPE_COMPUTE) {
         // The launch descriptor only allows setting up 8 CBs, but OpenGL
         // requires at least 12 UBOs. To bypass this limitation, we store the
         // addrs into the driver constbuf and we directly load from global
         // memory.
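         // The UBO base address and length are fetched from the driver
         // constbuf below and the access becomes a bounds-checked global load.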
         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
         Value *ind = i->getIndirect(0, 1);

         if (!ind && fileIndex == -1)
            return;

         if (ind) {
            // Clamp the UBO index when an indirect access is used to avoid
            // loading information from the wrong place in the driver cb.
            // TODO - synchronize the max with the driver.
            ind = bld.mkOp2v(OP_MIN, TYPE_U32, ind,
                             bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
                                        ind, bld.loadImm(NULL, fileIndex)),
                             bld.loadImm(NULL, 13));
            fileIndex = 0;
         }

         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset +
                                     typeSizeof(i->sType));
         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
         Value *length = loadUboLength32(ind, fileIndex * 16);
         Value *pred = new_LValue(func, FILE_PREDICATE);
         if (i->src(0).isIndirect(0)) {
            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
         }
         i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
         i->setPredicate(CC_NOT_P, pred);
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      } else if (i->src(0).isIndirect(1)) {
         Value *ptr;
         if (i->src(0).isIndirect(0))
            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(0x1010),
                             i->getIndirect(0, 0));
         else
            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(16));
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         i->subOp = NV50_IR_SUBOP_LDC_IS;
      }
   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
      i->op = OP_VFETCH;
   } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
      Value *ind = i->getIndirect(0, 1);
      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
      // XXX come up with a way not to do this for EVERY little access but
      // rather to batch these up somehow. Unfortunately we've lost the
      // information about the field width by the time we get here.
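      // The access offset (plus access size) is compared against the buffer
      // length below; out-of-bounds accesses are predicated off and loads
      // return 0.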
      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset +
                                  typeSizeof(i->sType));
      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (i->src(0).isIndirect(0)) {
         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
      }
      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, ptr);
      i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      i->setPredicate(CC_NOT_P, pred);
      if (i->defExists(0)) {
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      }
   }
}

void
NVC0LoweringPass::readTessCoord(LValue *dst, int c)
{
   Value *laneid = bld.getSSA();
   Value *x, *y;

   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));

   if (c == 0) {
      x = dst;
      y = NULL;
   } else
   if (c == 1) {
      x = NULL;
      y = dst;
   } else {
      assert(c == 2);
      if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
         bld.mkMov(dst, bld.loadImm(NULL, 0));
         return;
      }
      x = bld.getSSA();
      y = bld.getSSA();
   }
   if (x)
      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
   if (y)
      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);

   if (c == 2) {
      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
   }
}

bool
NVC0LoweringPass::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   const SVSemantic sv = sym->reg.data.sv.sv;
   Value *vtx = NULL;
   Instruction *ld;
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);

   if (addr >= 0x400) {
      // mov $sreg
      if (sym->reg.data.sv.index == 3) {
         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
      }
      if (sv == SV_VERTEX_COUNT) {
         bld.setPosition(i, true);
         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
                   bld.mkImm(0x808));
      }
      return true;
   }

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      if (i->srcExists(1)) {
         // Pass offset through to the interpolation logic
         ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
                           i->getDef(0), addr, NULL);
         ld->setSrc(1, i->getSrc(1));
      } else {
         bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      }
      break;
   case SV_FACE:
   {
      Value *face = i->getDef(0);
      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
         bld.mkOp1(OP_NEG, TYPE_S32, face, face);
         bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
      }
   }
      break;
   case SV_TESS_COORD:
      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
      break;
   case SV_NTID:
   case SV_NCTAID:
   case SV_GRIDID:
      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
      if (sym->reg.data.sv.index == 3) {
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
         return true;
      }
      // Fallthrough
   case SV_WORK_DIM:
      addr += prog->driver->prop.cp.gridInfoBase;
      bld.mkLoad(TYPE_U32, i->getDef(0),
                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                              TYPE_U32, addr), NULL);
      break;
   case SV_SAMPLE_INDEX:
      // TODO: Properly pass source as an address in the PIX address space
      // (which can be of the form [r0+offset]). But this is currently
      // unnecessary.
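      // PIXLD with the SAMPLEID subop reads the current sample index directly.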
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      break;
   case SV_SAMPLE_POS: {
      Value *off = new_LValue(func, FILE_GPR);
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
      bld.mkLoad(TYPE_F32,
                 i->getDef(0),
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase +
                       4 * sym->reg.data.sv.index),
                 off);
      break;
   }
   case SV_SAMPLE_MASK: {
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
      Instruction *sampleid =
         bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      Value *masked =
         bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
                    bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                               bld.loadImm(NULL, 1), sampleid->getDef(0)));
      if (prog->driver->prop.fp.persampleInvocation) {
         bld.mkMov(i->getDef(0), masked);
      } else {
         bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
                   bld.mkImm(0))
            ->subOp = 1;
      }
      break;
   }
   case SV_BASEVERTEX:
   case SV_BASEINSTANCE:
   case SV_DRAWID:
      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
                      bld.mkSymbol(FILE_MEMORY_CONST,
                                   prog->driver->io.auxCBSlot,
                                   TYPE_U32,
                                   prog->driver->io.drawInfoBase +
                                   4 * (sv - SV_BASEVERTEX)),
                      NULL);
      break;
   default:
      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      if (prog->getType() == Program::TYPE_FRAGMENT) {
         bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
      } else {
         ld = bld.mkFetch(i->getDef(0), i->dType,
                          FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
         ld->perPatch = i->perPatch;
      }
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   // Float division is lowered to a * rcp(b).
   Instruction *rcp =
      bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)),
                i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

bool
NVC0LoweringPass::handleMOD(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   // Float modulo is lowered to a - b * trunc(a * rcp(b)).
   LValue *value = bld.getScratch(typeSizeof(i->dType));
   bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
   bld.mkOp1(OP_TRUNC, i->dType, value, value);
   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
   i->op = OP_SUB;
   i->setSrc(1, value);
   return true;
}

bool
NVC0LoweringPass::handleSQRT(Instruction *i)
{
   if (i->dType == TYPE_F64) {
      Value *pred = bld.getSSA(1, FILE_PREDICATE);
      Value *zero = bld.loadImm(NULL, 0.0);
      Value *dst = bld.getSSA(8);
      bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
      bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
      bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
      i->op = OP_MUL;
      i->setSrc(1, dst);
      // TODO: Handle this properly with a library function
   } else {
      bld.setPosition(i, true);
      i->op = OP_RSQ;
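      // For f32, sqrt(x) is computed as rcp(rsq(x)).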
      bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
   }

   return true;
}

bool
NVC0LoweringPass::handlePOW(Instruction *i)
{
   // pow(a, b) is lowered to ex2(b * lg2(a)).
   LValue *val = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);

   i->op = OP_EX2;
   i->setSrc(0, val);
   i->setSrc(1, NULL);

   return true;
}

bool
NVC0LoweringPass::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      int id = i->getSrc(0)->reg.data.offset / 4;

      if (i->src(0).isIndirect(0)) // TODO, ugly
         return false;
      i->op = OP_MOV;
      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
      i->src(0).set(i->src(1));
      i->setSrc(1, NULL);
      i->setDef(0, new_LValue(func, FILE_GPR));
      i->getDef(0)->reg.data.id = id;

      prog->maxGPR = MAX2(prog->maxGPR, id);
   } else
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      i->setIndirect(0, 1, gpEmitAddress);
   }
   return true;
}

bool
NVC0LoweringPass::handleOUT(Instruction *i)
{
   Instruction *prev = i->prev;
   ImmediateValue stream, prevStream;

   // Only merge if the stream ids match. Also, note that the previous
   // instruction would have already been lowered, so we take arg1 from it.
   if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
       i->src(0).getImmediate(stream) &&
       prev->src(1).getImmediate(prevStream) &&
       stream.reg.data.u32 == prevStream.reg.data.u32) {
      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
      delete_Instruction(prog, i);
   } else {
      assert(gpEmitAddress);
      i->setDef(0, gpEmitAddress);
      i->setSrc(1, i->getSrc(0));
      i->setSrc(0, gpEmitAddress);
   }
   return true;
}

// Generate a binary predicate if an instruction is predicated by
// e.g. an f32 value.
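// The value is compared against 0 to produce a proper predicate register.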
void
NVC0LoweringPass::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *pdst;

   if (!pred || pred->reg.file == FILE_PREDICATE)
      return;
   pdst = new_LValue(func, FILE_PREDICATE);

   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass

   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0),
             pred);

   insn->setPredicate(insn->cc, pdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   bool ret = true;
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      ret = handleEXPORT(i);
      break;
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_STORE:
   case OP_LOAD:
      handleLDST(i);
      break;
   case OP_ATOM:
   {
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      if (targ->getChipset() >= NVISA_GM107_CHIPSET)
         handleSurfaceOpGM107(i->asTex());
      else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      else
         handleSurfaceOpNVC0(i->asTex());
      break;
   case OP_SUQ:
      handleSUQ(i->asTex());
      break;
   case OP_BUFQ:
      handleBUFQ(i);
      break;
   default:
      break;
   }

   /* Kepler+ has a special opcode to compute a new base address to be used
    * for indirect loads.
    *
    * Maxwell+ has an additional similar requirement for indirect
    * interpolation ops in frag shaders.
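    *
    * In both cases an AFETCH is emitted below and its result becomes the
    * instruction's indirect address source.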
2749 */ 2750 bool doAfetch = false; 2751 if (targ->getChipset() >= NVISA_GK104_CHIPSET && 2752 !i->perPatch && 2753 (i->op == OP_VFETCH || i->op == OP_EXPORT) && 2754 i->src(0).isIndirect(0)) { 2755 doAfetch = true; 2756 } 2757 if (targ->getChipset() >= NVISA_GM107_CHIPSET && 2758 (i->op == OP_LINTERP || i->op == OP_PINTERP) && 2759 i->src(0).isIndirect(0)) { 2760 doAfetch = true; 2761 } 2762 2763 if (doAfetch) { 2764 Value *addr = cloneShallow(func, i->getSrc(0)); 2765 Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(), 2766 i->getSrc(0)); 2767 afetch->setIndirect(0, 0, i->getIndirect(0, 0)); 2768 addr->reg.data.offset = 0; 2769 i->setSrc(0, addr); 2770 i->setIndirect(0, 0, afetch->getDef(0)); 2771 } 2772 2773 return ret; 2774 } 2775 2776 bool 2777 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const 2778 { 2779 if (stage == CG_STAGE_PRE_SSA) { 2780 NVC0LoweringPass pass(prog); 2781 return pass.run(prog, false, true); 2782 } else 2783 if (stage == CG_STAGE_POST_RA) { 2784 NVC0LegalizePostRA pass(prog); 2785 return pass.run(prog, false, true); 2786 } else 2787 if (stage == CG_STAGE_SSA) { 2788 NVC0LegalizeSSA pass; 2789 return pass.run(prog, false, true); 2790 } 2791 return false; 2792 } 2793 2794 } // namespace nv50_ir 2795