Home | History | Annotate | Download | only in codegen
      1 /*
      2  * Copyright 2011 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "codegen/nv50_ir.h"
     24 #include "codegen/nv50_ir_build_util.h"
     25 
     26 #include "codegen/nv50_ir_target_nvc0.h"
     27 #include "codegen/nv50_ir_lowering_nvc0.h"
     28 
     29 #include <limits>
     30 
     31 namespace nv50_ir {
     32 
// Per-lane quad-op selectors (2 bits each) used to build QUADOP masks.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Pack one 2-bit op per quad lane, ordered upper-left, upper-right,
// lower-left, lower-right:
//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
     42 
// Lower 32-bit integer DIV/MOD into a call to the built-in division
// routine: the operands are moved into fixed registers $r0/$r1, the
// builtin is called, the appropriate result register is copied to the
// original destination, and the original instruction is deleted.
// Non-32-bit integer types are left untouched.
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;
   Value *def[2];

   bld.setPosition(i, false);
   // Builtin ABI: dividend goes into register 0, divisor into register 1.
   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   // Quotient is returned in register 0, remainder (MOD) in register 1.
   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
   // Declare the registers the builtin overwrites so the scheduler/RA
   // keeps live values out of them (mask differs for DIV vs MOD and for
   // the signed variant, which also uses more predicates).
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}
     69 
// Lower 64-bit float RCP/RSQ: the hardware op only approximates the high
// 32 bits of the f64 result, so run it on the source's high word and
// merge with a zero low word.
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and split it up into two 32-bit halves.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}
     98 
     99 void
    100 NVC0LegalizeSSA::handleFTZ(Instruction *i)
    101 {
    102    // Only want to flush float inputs
    103    assert(i->sType == TYPE_F32);
    104 
    105    // If we're already flushing denorms (and NaN's) to zero, no need for this.
    106    if (i->dnz)
    107       return;
    108 
    109    // Only certain classes of operations can flush
    110    OpClass cls = prog->getTarget()->getOpClass(i->op);
    111    if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
    112        cls != OPCLASS_CONVERT)
    113       return;
    114 
    115    i->ftz = true;
    116 }
    117 
// Fold an explicit-LOD texture fetch whose LOD is an immediate 0 into the
// level-zero form (TXL becomes plain TEX; TXF keeps its opcode) and drop
// the now-unused LOD source.
void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores indirect handle combined with array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   // Only fold when the LOD is a literal integer 0.
   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
   // Shift the remaining sources down by one, removing the LOD argument.
   i->moveSources(arg + 1, -1);
}
    149 
// Per-function setup: point the shared builder at this function's program.
bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}
    156 
    157 bool
    158 NVC0LegalizeSSA::visit(BasicBlock *bb)
    159 {
    160    Instruction *next;
    161    for (Instruction *i = bb->getEntry(); i; i = next) {
    162       next = i->next;
    163 
    164       if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
    165          handleFTZ(i);
    166 
    167       switch (i->op) {
    168       case OP_DIV:
    169       case OP_MOD:
    170          if (i->sType != TYPE_F32)
    171             handleDIV(i);
    172          break;
    173       case OP_RCP:
    174       case OP_RSQ:
    175          if (i->dType == TYPE_F64)
    176             handleRCPRSQ(i);
    177          break;
    178       case OP_TXL:
    179       case OP_TXF:
    180          handleTEXLOD(i->asTex());
    181          break;
    182       default:
    183          break;
    184       }
    185    }
    186    return true;
    187 }
    188 
// The fixed hardware values (rZero/carry/pOne) are allocated per function
// in visit(Function). Texture barriers are only emitted for chipsets in
// [0xe0, 0x110) -- presumably the Kepler range; earlier/later chips do not
// need this pass's texbar insertion (TODO confirm the upper bound).
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}
    197 
    198 bool
    199 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
    200                                     const Instruction *early) const
    201 {
    202    if (early->bb == later->bb)
    203       return early->serial < later->serial;
    204    return later->bb->dominatedBy(early->bb);
    205 }
    206 
// Record 'usei' as a first use of the result of 'texi', pruning uses that
// domination makes redundant.
void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            // An already-recorded use that dominates the new one makes the
            // new use redundant -- a barrier there already covers us.
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            // Conversely, the new use dominating an old one makes the old
            // one redundant; erase it and keep scanning.
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}
    240 
    241 // While it might be tempting to use the an algorithm that just looks at tex
    242 // uses, not all texture results are guaranteed to be used on all paths. In
    243 // the case where along some control flow path a texture result is never used,
    244 // we might reuse that register for something else, creating a
    245 // write-after-write hazard. So we have to manually look through all
    246 // instructions looking for ones that reference the registers in question.
    247 void
    248 NVC0LegalizePostRA::findFirstUses(
    249    Instruction *texi, std::list<TexUse> &uses)
    250 {
    251    int minGPR = texi->def(0).rep()->reg.data.id;
    252    int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
    253 
    254    unordered_set<const BasicBlock *> visited;
    255    findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
    256 }
    257 
// Scan from 'start' to the end of its basic block (and recursively into
// CFG successors) for the first instruction that reads or writes any GPR
// in [minGPR, maxGPR], recording each such first use via addTexUse().
void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct,
   // however we might be in a loop and hit this BB again, and need to process
   // the full thing. So only mark a bb as visited if we processed it from the
   // beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   // NOTE(review): the loop bound excludes getExit() itself -- presumably
   // the exit (flow) instruction cannot reference the tex result; confirm.
   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      // A write overlapping the range is also a "use" (WAW hazard).
      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      // A read overlapping the range is the classic RAW hazard.
      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   // No use in this block: continue the search in all successors.
   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}
    306 
// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   // Assign serial numbers; used for intra-block ordering comparisons.
   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   // NOTE(review): plain operator new[] throws std::bad_alloc on failure
   // instead of returning NULL, so this check is dead code.
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            // Same block: the level is the number of TEXes issued between
            // this one and the use.
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            // Different blocks: take the path with the fewest intervening
            // TEXes (texCounts is the per-block edge weight).
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         // Merge with an existing barrier, keeping the stricter (lower) level.
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      // max stays "infinite" until the first barrier caps it.
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         // Entry limits are the worst case over all predecessors' exits.
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            // A barrier at or above the outstanding-TEX bound is a no-op.
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               // Two adjacent barriers: the earlier, weaker one is redundant.
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}
    500 
// Per-function setup: insert texture barriers where required and create
// the fixed hardware values used by the per-block rewrites.
bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);       // register that reads as zero
   pOne = new_LValue(fn, FILE_PREDICATE);  // always-true predicate
   carry = new_LValue(fn, FILE_FLAGS);     // $c flag for 64-bit op splitting

   // GK20A+ exposes 256 GPRs, so the zero register is $r255; older chips
   // use $r63.
   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}
    517 
    518 void
    519 NVC0LegalizePostRA::replaceZero(Instruction *i)
    520 {
    521    for (int s = 0; i->srcExists(s); ++s) {
    522       if (s == 2 && i->op == OP_SUCLAMP)
    523          continue;
    524       ImmediateValue *imm = i->getSrc(s)->asImm();
    525       if (imm) {
    526          if (i->op == OP_SELP && s == 2) {
    527             i->setSrc(s, pOne);
    528             if (imm->reg.data.u64 == 0)
    529                i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
    530          } else if (imm->reg.data.u64 == 0) {
    531             i->setSrc(s, rZero);
    532          }
    533       }
    534    }
    535 }
    536 
// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   // Candidate blocks have exactly two incident edges and begin with PRECONT.
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   // Locate the back edge (the edge coming from the continue).
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   // Only rewrite when that block ends in a single unconditional CONT.
   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   // There must not be a second back edge.
   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}
    560 
// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   // Skip blocks not starting with a JOIN, and JOINs already marked
   // must-not-propagate (limit) by a previous rewrite.
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   // The join has been pushed into the predecessors; drop the entry JOIN.
   bb->remove(bb->getEntry());
}
    582 
// Post-RA per-block legalization: remove pseudo ops and no-ops, replace
// immediate zeros with the zero register, split 64-bit operations, and
// normalize control flow (CONT->BRA, BRA->JOIN).
bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
         // Fold offsets outside the signed 16-bit range into the file
         // index, keeping only the low 16 bits as the immediate offset.
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) > 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi; // re-visit the newly created high-half instruction
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}
    634 
// Cache the target and point the builder at the program being lowered.
NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
}
    639 
// Per-function setup. For geometry shaders, initialize the emit address
// (gpEmitAddress) to 0 at function entry and move it to register 0 just
// before the function exits.
bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}
    655 
// No per-basic-block work is needed for this pass.
bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}
    661 
    662 inline Value *
    663 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
    664 {
    665    uint8_t b = prog->driver->io.auxCBSlot;
    666    uint32_t off = prog->driver->io.texBindBase + slot * 4;
    667 
    668    if (ptr)
    669       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));
    670 
    671    return bld.
    672       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
    673 }
    674 
    675 // move array source to first slot, convert to u16, add indirections
    676 bool
    677 NVC0LoweringPass::handleTEX(TexInstruction *i)
    678 {
    679    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    680    const int arg = i->tex.target.getArgCount();
    681    const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
    682    const int chipset = prog->getTarget()->getChipset();
    683 
    684    /* Only normalize in the non-explicit derivatives case. For explicit
    685     * derivatives, this is handled in handleManualTXD.
    686     */
    687    if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
    688       Value *src[3], *val;
    689       int c;
    690       for (c = 0; c < 3; ++c)
    691          src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
    692       val = bld.getScratch();
    693       bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
    694       bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
    695       bld.mkOp1(OP_RCP, TYPE_F32, val, val);
    696       for (c = 0; c < 3; ++c) {
    697          i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
    698                                  i->getSrc(c), val));
    699       }
    700    }
    701 
    702    // Arguments to the TEX instruction are a little insane. Even though the
    703    // encoding is identical between SM20 and SM30, the arguments mean
    704    // different things between Fermi and Kepler+. A lot of arguments are
    705    // optional based on flags passed to the instruction. This summarizes the
    706    // order of things.
    707    //
    708    // Fermi:
    709    //  array/indirect
    710    //  coords
    711    //  sample
    712    //  lod bias
    713    //  depth compare
    714    //  offsets:
    715    //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
    716    //    - other: 4 bits each, single reg
    717    //
    718    // Kepler+:
    719    //  indirect handle
    720    //  array (+ offsets for txd in upper 16 bits)
    721    //  coords
    722    //  sample
    723    //  lod bias
    724    //  depth compare
    725    //  offsets (same as fermi, except txd which takes it with array)
    726    //
    727    // Maxwell (tex):
    728    //  array
    729    //  coords
    730    //  indirect handle
    731    //  sample
    732    //  lod bias
    733    //  depth compare
    734    //  offsets
    735    //
    736    // Maxwell (txd):
    737    //  indirect handle
    738    //  coords
    739    //  array + offsets
    740    //  derivatives
    741 
    742    if (chipset >= NVISA_GK104_CHIPSET) {
    743       if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
    744          // XXX this ignores tsc, and assumes a 1:1 mapping
    745          assert(i->tex.rIndirectSrc >= 0);
    746          Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
    747          i->tex.r = 0xff;
    748          i->tex.s = 0x1f;
    749          i->setIndirectR(hnd);
    750          i->setIndirectS(NULL);
    751       } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
    752          if (i->tex.r == 0xffff)
    753             i->tex.r = prog->driver->io.fbtexBindBase / 4;
    754          else
    755             i->tex.r += prog->driver->io.texBindBase / 4;
    756          i->tex.s  = 0; // only a single cX[] value possible here
    757       } else {
    758          Value *hnd = bld.getScratch();
    759          Value *rHnd = loadTexHandle(NULL, i->tex.r);
    760          Value *sHnd = loadTexHandle(NULL, i->tex.s);
    761 
    762          bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
    763 
    764          i->tex.r = 0; // not used for indirect tex
    765          i->tex.s = 0;
    766          i->setIndirectR(hnd);
    767       }
    768       if (i->tex.target.isArray()) {
    769          LValue *layer = new_LValue(func, FILE_GPR);
    770          Value *src = i->getSrc(lyr);
    771          const int sat = (i->op == OP_TXF) ? 1 : 0;
    772          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
    773          bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
    774          if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
    775             for (int s = dim; s >= 1; --s)
    776                i->setSrc(s, i->getSrc(s - 1));
    777             i->setSrc(0, layer);
    778          } else {
    779             i->setSrc(dim, layer);
    780          }
    781       }
    782       // Move the indirect reference to the first place
    783       if (i->tex.rIndirectSrc >= 0 && (
    784                 i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
    785          Value *hnd = i->getIndirectR();
    786 
    787          i->setIndirectR(NULL);
    788          i->moveSources(0, 1);
    789          i->setSrc(0, hnd);
    790          i->tex.rIndirectSrc = 0;
    791          i->tex.sIndirectSrc = -1;
    792       }
    793       // Move the indirect reference to right after the coords
    794       else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
    795          Value *hnd = i->getIndirectR();
    796 
    797          i->setIndirectR(NULL);
    798          i->moveSources(arg, 1);
    799          i->setSrc(arg, hnd);
    800          i->tex.rIndirectSrc = 0;
    801          i->tex.sIndirectSrc = -1;
    802       }
    803    } else
    804    // (nvc0) generate and move the tsc/tic/array source to the front
    805    if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
    806       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
    807 
    808       Value *ticRel = i->getIndirectR();
    809       Value *tscRel = i->getIndirectS();
    810 
    811       if (i->tex.r == 0xffff) {
    812          i->tex.r = 0x20;
    813          i->tex.s = 0x10;
    814       }
    815 
    816       if (ticRel) {
    817          i->setSrc(i->tex.rIndirectSrc, NULL);
    818          if (i->tex.r)
    819             ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
    820                                 ticRel, bld.mkImm(i->tex.r));
    821       }
    822       if (tscRel) {
    823          i->setSrc(i->tex.sIndirectSrc, NULL);
    824          if (i->tex.s)
    825             tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
    826                                 tscRel, bld.mkImm(i->tex.s));
    827       }
    828 
    829       Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
    830       if (arrayIndex) {
    831          for (int s = dim; s >= 1; --s)
    832             i->setSrc(s, i->getSrc(s - 1));
    833          i->setSrc(0, arrayIndex);
    834       } else {
    835          i->moveSources(0, 1);
    836       }
    837 
    838       if (arrayIndex) {
    839          int sat = (i->op == OP_TXF) ? 1 : 0;
    840          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
    841          bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
    842       } else {
    843          bld.loadImm(src, 0);
    844       }
    845 
    846       if (ticRel)
    847          bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
    848       if (tscRel)
    849          bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
    850 
    851       i->setSrc(0, src);
    852    }
    853 
    854    // For nvc0, the sample id has to be in the second operand, as the offset
    855    // does. Right now we don't know how to pass both in, and this case can't
    856    // happen with OpenGL. On nve0, the sample id is part of the texture
    857    // coordinate argument.
    858    assert(chipset >= NVISA_GK104_CHIPSET ||
    859           !i->tex.useOffsets || !i->tex.target.isMS());
    860 
    861    // offset is between lod and dc
    862    if (i->tex.useOffsets) {
    863       int n, c;
    864       int s = i->srcCount(0xff, true);
    865       if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
    866          if (i->tex.target.isShadow())
    867             s--;
    868          if (i->srcExists(s)) // move potential predicate out of the way
    869             i->moveSources(s, 1);
    870          if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
    871             i->moveSources(s + 1, 1);
    872       }
    873       if (i->op == OP_TXG) {
    874          // Either there is 1 offset, which goes into the 2 low bytes of the
    875          // first source, or there are 4 offsets, which go into 2 sources (8
    876          // values, 1 byte each).
    877          Value *offs[2] = {NULL, NULL};
    878          for (n = 0; n < i->tex.useOffsets; n++) {
    879             for (c = 0; c < 2; ++c) {
    880                if ((n % 2) == 0 && c == 0)
    881                   bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
    882                else
    883                   bld.mkOp3(OP_INSBF, TYPE_U32,
    884                             offs[n / 2],
    885                             i->offset[n][c].get(),
    886                             bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
    887                             offs[n / 2]);
    888             }
    889          }
    890          i->setSrc(s, offs[0]);
    891          if (offs[1])
    892             i->setSrc(s + 1, offs[1]);
    893       } else {
    894          unsigned imm = 0;
    895          assert(i->tex.useOffsets == 1);
    896          for (c = 0; c < 3; ++c) {
    897             ImmediateValue val;
    898             if (!i->offset[0][c].getImmediate(val))
    899                assert(!"non-immediate offset passed to non-TXG");
    900             imm |= (val.reg.data.u32 & 0xf) << (c * 4);
    901          }
    902          if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
    903             // The offset goes into the upper 16 bits of the array index. So
    904             // create it if it's not already there, and INSBF it if it already
    905             // is.
    906             s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
    907             if (chipset >= NVISA_GM107_CHIPSET)
    908                s += dim;
    909             if (i->tex.target.isArray()) {
    910                bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
    911                          bld.loadImm(NULL, imm), bld.mkImm(0xc10),
    912                          i->getSrc(s));
    913             } else {
    914                i->moveSources(s, 1);
    915                i->setSrc(s, bld.loadImm(NULL, imm << 16));
    916             }
    917          } else {
    918             i->setSrc(s, bld.loadImm(NULL, imm));
    919          }
    920       }
    921    }
    922 
    923    if (chipset >= NVISA_GK104_CHIPSET) {
    924       //
    925       // If TEX requires more than 4 sources, the 2nd register tuple must be
    926       // aligned to 4, even if it consists of just a single 4-byte register.
    927       //
    928       // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
    929       //
    930       int s = i->srcCount(0xff, true);
    931       if (s > 4 && s < 7) {
    932          if (i->srcExists(s)) // move potential predicate out of the way
    933             i->moveSources(s, 7 - s);
    934          while (s < 7)
    935             i->setSrc(s++, bld.loadImm(NULL, 0));
    936       }
    937    }
    938 
    939    return true;
    940 }
    941 
// Emulate TXD by executing a plain TEX once per quad lane: for each lane,
// broadcast that lane's coordinates to the whole quad, offset them by the
// lane's dPdx/dPdy via quadops so the hw's implicit derivatives come out
// right, run the texture op, and keep only lane l's result. The four
// per-lane results are then recombined with OP_UNION.
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // Quadop codes per lane: column 0 applies dPdx, column 1 applies dPdy.
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];             // def[component][lane] partial results
   Value *crd[3];                // derived coordinates for the current lane
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates). Maxwell is
   // handled in a separate function.
   unsigned array;
   if (targ->getChipset() < NVISA_GK104_CHIPSET)
      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   else
      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates: divide all components by the one with
      // the largest magnitude (via max/max/rcp/mul)
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture: clone the (now OP_TEX) instruction with this lane's coords
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // save results; the mov is restricted to lane l via its lanes mask
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // Each final component is the union of the 4 per-lane values.
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
   1025 
// Lower TXD. If the hw encoding cannot hold all sources (more than 4
// expected args, more than 2 dims, or a shadow reference), switch the op
// to TEX and fall back to the manual per-lane expansion.
bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   // Predict how many extra sources handleTEX() will add for offsets and
   // indirect handles; the rules differ between Fermi and Kepler+.
   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
      if (!txd->tex.target.isArray() && (
                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow())
      txd->op = OP_TEX; // force the manual fallback below

   handleTEX(txd);
   // advance arg past all sources handleTEX may have inserted
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (txd->op == OP_TEX)
      return handleManualTXD(txd);

   assert(arg == expected_args);
   // Append the derivatives, interleaved as dPdx/dPdy per coordinate.
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }

   // In this case we have fewer than 4 "real" arguments, which means that
   // handleTEX didn't apply any padding. However we have to make sure that
   // the second "group" of arguments still gets padded up to 4.
   if (chipset >= NVISA_GK104_CHIPSET) {
      int s = arg + 2 * dim;
      if (s >= 4 && s < 7) {
         if (txd->srcExists(s)) // move potential predicate out of the way
            txd->moveSources(s, 7 - s);
         while (s < 7)
            txd->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}
   1083 
// Lower TXQ: apply the texture binding base for direct queries, and for
// indirect ones fold the TIC reference into the leading source.
bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   const int chipset = prog->getTarget()->getChipset();
   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
      txq->tex.r += prog->driver->io.texBindBase / 4;

   if (txq->tex.rIndirectSrc < 0)
      return true;

   Value *ticRel = txq->getIndirectR();

   // Only the TIC reference matters for a query; drop any sampler indirect.
   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;

   assert(ticRel);

   if (chipset < NVISA_GK104_CHIPSET) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      txq->setSrc(txq->tex.rIndirectSrc, NULL);
      if (txq->tex.r)
         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                             ticRel, bld.mkImm(txq->tex.r))

      // shift the TIC index left by 23 bits into the 'tt' field
      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));

      txq->moveSources(0, 1);
      txq->setSrc(0, src);
   } else {
      // Kepler+: look up the bindless handle in the driver's handle table
      // and pass it as the leading source.
      Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
      txq->tex.r = 0xff;
      txq->tex.s = 0x1f;

      txq->setIndirectR(NULL);
      txq->moveSources(0, 1);
      txq->setSrc(0, hnd);
      txq->tex.rIndirectSrc = 0;
   }

   return true;
}
   1126 
   1127 bool
   1128 NVC0LoweringPass::handleTXLQ(TexInstruction *i)
   1129 {
   1130    /* The outputs are inverted compared to what the TGSI instruction
   1131     * expects. Take that into account in the mask.
   1132     */
   1133    assert((i->tex.mask & ~3) == 0);
   1134    if (i->tex.mask == 1)
   1135       i->tex.mask = 2;
   1136    else if (i->tex.mask == 2)
   1137       i->tex.mask = 1;
   1138    handleTEX(i);
   1139    bld.setPosition(i, true);
   1140 
   1141    /* The returned values are not quite what we want:
   1142     * (a) convert from s16/u16 to f32
   1143     * (b) multiply by 1/256
   1144     */
   1145    for (int def = 0; def < 2; ++def) {
   1146       if (!i->defExists(def))
   1147          continue;
   1148       enum DataType type = TYPE_S16;
   1149       if (i->tex.mask == 2 || def > 0)
   1150          type = TYPE_U16;
   1151       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
   1152       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
   1153                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   1154    }
   1155    if (i->tex.mask == 3) {
   1156       LValue *t = new_LValue(func, FILE_GPR);
   1157       bld.mkMov(t, i->getDef(0));
   1158       bld.mkMov(i->getDef(0), i->getDef(1));
   1159       bld.mkMov(i->getDef(1), t);
   1160    }
   1161    return true;
   1162 }
   1163 
   1164 bool
   1165 NVC0LoweringPass::handleBUFQ(Instruction *bufq)
   1166 {
   1167    bufq->op = OP_MOV;
   1168    bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
   1169                                    bufq->getSrc(0)->reg.fileIndex * 16));
   1170    bufq->setIndirect(0, 0, NULL);
   1171    bufq->setIndirect(0, 1, NULL);
   1172    return true;
   1173 }
   1174 
// Emulate shared-memory atomics on Kepler with a load-locked /
// store-unlocked retry loop. The basic block containing the atomic is
// split and rewritten into:
//    currBB -> tryLockBB -> setAndUnlockBB -> failLockBB -> joinBB
// where failLockBB loops back to tryLockBB until the unlocked store
// reports success through a predicate.
void
NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   // Seed the retry predicate with false (0 == 1) so lanes that reach
   // failLockBB without storing keep looping.
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0), bld.mkImm(1));

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   // Load-locked: def(1) is a predicate reporting lock acquisition.
   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   // Compute the value to store according to the atomic sub-op.
   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS: compare the loaded value with src(1), select src(2) on match,
      // otherwise keep the old value.
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
   } else {
      // Arithmetic/logic atomics map directly onto a plain ALU op.
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
                         atom->getSrc(1));
   }

   // Store-unlocked: its def(0) predicate reports whether the store (and
   // thus the whole atomic) succeeded.
   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setDef(0, pred->getDef(0));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Loop back to the locked load until the store has been performed.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}
   1275 
// Emulate shared-memory atomics on Fermi with a single self-looping block:
// load-locked, compute the new value (predicated on holding the lock),
// store-unlocked, and branch back until the lock was acquired.
void
NVC0LoweringPass::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockAndSetBB, true);

   // Load-locked: def(1) is a predicate reporting lock acquisition.
   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   // Compute the value to store; each op is predicated on holding the
   // lock so lanes that failed to lock don't modify anything.
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS: compare the loaded value with src(1), then select between the
      // old value and src(2); note the negated third (condition) source.
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
      set->setPredicate(CC_P, ld->getDef(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(2), set->getDef(0));
      selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
      selp->setPredicate(CC_P, ld->getDef(1));

      stVal = selp->getDef(0);
   } else {
      // Arithmetic/logic atomics map directly onto a plain ALU op.
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));
      i->setPredicate(CC_P, ld->getDef(1));

      stVal = i->getDef(0);
   }

   // Store-unlocked, only for lanes that hold the lock.
   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setPredicate(CC_P, ld->getDef(1));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   // Loop until the lock is acquired.
   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
   tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
   tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);

   bld.remove(atom);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}
   1369 
// Lower ATOM: rewrite local/shared/buffer address spaces into the global
// address form the hw atomics expect (or emulate, for pre-Maxwell shared
// memory).
bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;
   Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
      // operations on shared memory. For Maxwell, ATOMS is enough.
      if (targ->getChipset() < NVISA_GK104_CHIPSET)
         handleSharedATOM(atom);
      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
         handleSharedATOMNVE4(atom);
      return true;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
      // Buffer atomics: address = 64-bit buffer base from the driver's
      // info table, plus the optional dynamic offset.
      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
      assert(base->reg.size == 8);
      if (ptr)
         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
      assert(base->reg.size == 8);
      atom->setIndirect(0, 0, base);
      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;

      // Harden against out-of-bounds accesses
      Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
      Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (ptr)
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
      // predicate the atomic off when the access end exceeds the length
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      atom->setPredicate(CC_NOT_P, pred);
      if (atom->defExists(0)) {
         // An out-of-bounds atomic still has to define its result: union
         // the (possibly skipped) result with a predicated 0.
         Value *zero, *dst = atom->getDef(0);
         atom->setDef(0, bld.getSSA());

         bld.setPosition(atom, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
      }

      return true;
   }
   // Local memory: offset by the SV_LBASE system value and move the
   // access into the global file.
   base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 1, NULL);
   atom->setIndirect(0, 0, base);

   return true;
}
   1430 
   1431 bool
   1432 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
   1433 {
   1434    if (targ->getChipset() < NVISA_GM107_CHIPSET) {
   1435       if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
   1436          // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
   1437          return false;
   1438       }
   1439    }
   1440 
   1441    if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
   1442        cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
   1443       return false;
   1444    bld.setPosition(cas, true);
   1445 
   1446    if (needCctl) {
   1447       Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
   1448       cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
   1449       cctl->fixed = 1;
   1450       cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
   1451       if (cas->isPredicated())
   1452          cctl->setPredicate(cas->cc, cas->getPredicate());
   1453    }
   1454 
   1455    if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
   1456       // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
   1457       // should be set to the high part of the double reg or bad things will
   1458       // happen elsewhere in the universe.
   1459       // Also, it sometimes returns the new value instead of the old one
   1460       // under mysterious circumstances.
   1461       Value *dreg = bld.getSSA(8);
   1462       bld.setPosition(cas, false);
   1463       bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
   1464       cas->setSrc(1, dreg);
   1465       cas->setSrc(2, dreg);
   1466    }
   1467 
   1468    return true;
   1469 }
   1470 
   1471 inline Value *
   1472 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
   1473 {
   1474    uint8_t b = prog->driver->io.auxCBSlot;
   1475    off += base;
   1476 
   1477    return bld.
   1478       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
   1479 }
   1480 
   1481 inline Value *
   1482 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
   1483 {
   1484    uint8_t b = prog->driver->io.auxCBSlot;
   1485    off += base;
   1486 
   1487    if (ptr)
   1488       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
   1489 
   1490    return bld.
   1491       mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
   1492 }
   1493 
   1494 inline Value *
   1495 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
   1496 {
   1497    uint8_t b = prog->driver->io.auxCBSlot;
   1498    off += base;
   1499 
   1500    if (ptr)
   1501       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
   1502 
   1503    return bld.
   1504       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
   1505 }
   1506 
   1507 inline Value *
   1508 NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
   1509 {
   1510    return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
   1511 }
   1512 
   1513 inline Value *
   1514 NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
   1515 {
   1516    return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
   1517 }
   1518 
   1519 inline Value *
   1520 NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
   1521 {
   1522    return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
   1523 }
   1524 
   1525 inline Value *
   1526 NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
   1527 {
   1528    return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
   1529 }
   1530 
   1531 inline Value *
   1532 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
   1533 {
   1534    uint8_t b = prog->driver->io.msInfoCBSlot;
   1535    off += prog->driver->io.msInfoBase;
   1536    return bld.
   1537       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
   1538 }
   1539 
/* On nvc0, surface info is obtained via the surface binding points passed
 * to the SULD/SUST instructions.
 * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordinates or generating an address.
 * They couldn't just have added an equivalent to TIC now, couldn't they ?
 */
// Byte offsets of the fields within one surface-info record.
#define NVC0_SU_INFO_ADDR   0x00
#define NVC0_SU_INFO_FMT    0x04
#define NVC0_SU_INFO_DIM_X  0x08
#define NVC0_SU_INFO_PITCH  0x0c
#define NVC0_SU_INFO_DIM_Y  0x10
#define NVC0_SU_INFO_ARRAY  0x14
#define NVC0_SU_INFO_DIM_Z  0x18
#define NVC0_SU_INFO_UNK1C  0x1c
#define NVC0_SU_INFO_WIDTH  0x20
#define NVC0_SU_INFO_HEIGHT 0x24
#define NVC0_SU_INFO_DEPTH  0x28
#define NVC0_SU_INFO_TARGET 0x2c
#define NVC0_SU_INFO_BSIZE  0x30
#define NVC0_SU_INFO_RAW_X  0x34
#define NVC0_SU_INFO_MS_X   0x38
#define NVC0_SU_INFO_MS_Y   0x3c

// Size of one surface-info record (one record per binding slot).
#define NVC0_SU_INFO__STRIDE 0x40

// Indexed accessors: DIM entries are 8 bytes apart, SIZE and MS 4 bytes.
#define NVC0_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVC0_SU_INFO_MS(i)   (0x38 + (i) * 4)
   1568 
   1569 inline Value *
   1570 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off)
   1571 {
   1572    uint32_t base = slot * NVC0_SU_INFO__STRIDE;
   1573 
   1574    if (ptr) {
   1575       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
   1576       ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
   1577       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
   1578       base = 0;
   1579    }
   1580    off += base;
   1581 
   1582    return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
   1583 }
   1584 
   1585 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
   1586 {
   1587    switch (su->tex.target.getEnum()) {
   1588    case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   1589    case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1590    case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1591    case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
   1592                                    NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
   1593                                    NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1594    case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   1595    case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   1596    case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1597    case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1598    case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1599    case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1600    case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1601    default:
   1602       assert(0);
   1603       return 0;
   1604    }
   1605 }
   1606 
// Lower a surface size query (SUQ) into moves/loads of the per-slot surface
// info stored in c[] (see the NVC0_SU_INFO_* table above), then remove the
// original instruction.
bool
NVC0LoweringPass::handleSUQ(TexInstruction *suq)
{
   int mask = suq->tex.mask;
   int dim = suq->tex.target.getDim();
   // number of coordinate components, including the array/cube layer
   int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   Value *ind = suq->getIndirectR(); // dynamic surface index, may be NULL
   int slot = suq->tex.r;
   int c, d;

   // One def per component requested in the mask; d tracks the def index
   // separately since masked-off components are skipped.
   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
      if (c >= arg || !(mask & 1))
         continue;

      int offset;

      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
         // For 1D arrays the layer count lives in the Z size slot.
         offset = NVC0_SU_INFO_SIZE(2);
      } else {
         offset = NVC0_SU_INFO_SIZE(c);
      }
      bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset));
      // Cube targets store 6 faces per cube; divide the layer count by 6.
      if (c == 2 && suq->tex.target.isCube())
         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
                   bld.loadImm(NULL, 6));
   }

   // Fourth component: sample count, 1 << (ms_x + ms_y) for MS targets,
   // otherwise constant 1.
   if (mask & 1) {
      if (suq->tex.target.isMS()) {
         Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
         Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));
         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
      } else {
         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
      }
   }

   bld.remove(suq);
   return true;
}
   1648 
// Rewrite the coordinates of a 2D multisampled access into coordinates on
// the larger non-MS surface that backs it: x/y are shifted left by the
// per-axis MS factors from the surface info (presumably log2 of the sample
// grid — TODO confirm), and the per-sample (dx, dy) offset is looked up in
// the MS info table indexed by the sample id. The sample source is dropped.
void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const int arg = tex->tex.target.getArgCount();
   int slot = tex->tex.r;

   // Only 2D MS targets need adjusting; demote them to their non-MS kind.
   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1); // sample index is the last argument

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
   Value *ind = tex->getIndirectR();

   Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
   Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   // Index into the MS info table: 8 bytes (two 32-bit words) per sample,
   // sample id wrapped to 8 entries.
   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1); // drop the now-consumed sample argument
}
   1689 
// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   // "raw" (B-suffixed) ops address bytes directly rather than formatted texels
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
   Value *v;
   Value *src[3];
   // bf and eau become the low/high 32 bits of the 64-bit address (see the
   // OP_MERGE near the end); off is an intermediate pixel offset.
   Value *bf, *eau, *off;
   Value *addr, *pred;
   Value *ind = su->getIndirectR();

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      int dimc = c;

      if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
         // The array index is stored in the Z component for 1D arrays.
         dimc = 2;
      }

      src[c] = bld.getScratch();
      if (c == 0 && raw)
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X)
;
      else
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, dimc);
   }
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   // SUCLAMP's flags def signals an out-of-bounds coordinate.
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      // The layer component is checked separately and OR'd into pred below.
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         // shift the element index by an amount taken from the FMT info word
         // (presumably log2 of the texel byte size — TODO confirm)
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
            // 2D (non-array) is handled like 3D with info from UNK1C
            z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      // SUBFM produces the low address word from the coordinates and also
      // sets the predicate (out-of-bounds indication).
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      // Atomics want the address split at bit 8 instead of bit 32;
      // repack bf/eau accordingly with PERMT.
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred1 =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));

   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      // total bits per texel of the shader-declared format
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      // make sure that the format doesn't mismatch
      assert(format->components != 0);
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred1->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred1->getDef(0));

   // TODO: initialize def values to 0 when the surface operation is not
   // performed (not needed for stores). Also, fix the "address bounds test"
   // subtests from arb_shader_image_load_store-invalid for buffers, because it
   // seems like that the predicate is not correctly set by suclamp.
}
   1890 
   1891 static DataType
   1892 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
   1893 {
   1894    switch (t->type) {
   1895    case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
   1896    case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
   1897    case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
   1898    case UINT:
   1899       return (t->bits[c] == 8 ? TYPE_U8 :
   1900               (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
   1901    case SINT:
   1902       return (t->bits[c] == 8 ? TYPE_S8 :
   1903               (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
   1904    }
   1905    return TYPE_NONE;
   1906 }
   1907 
   1908 static DataType
   1909 getDestType(const ImgType type) {
   1910    switch (type) {
   1911    case FLOAT:
   1912    case UNORM:
   1913    case SNORM:
   1914       return TYPE_F32;
   1915    case UINT:
   1916       return TYPE_U32;
   1917    case SINT:
   1918       return TYPE_S32;
   1919    default:
   1920       assert(!"Impossible type");
   1921       return TYPE_NONE;
   1922    }
   1923 }
   1924 
// Lower a typed surface load (SULDP) into an untyped byte load (SULDB)
// followed by code that unpacks/converts the raw words into the four typed
// result components the shader expects.
void
NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
{
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   // total texel size in bits
   int width = format->bits[0] + format->bits[1] +
      format->bits[2] + format->bits[3];
   Value *untypedDst[4] = {};
   Value *typedDst[4] = {};

   // We must convert this to a generic load.
   su->op = OP_SULDB;

   su->dType = typeOfSize(width / 8);
   su->sType = TYPE_U8;

   // One untyped def per 32-bit word of raw data (at least one for
   // sub-word formats).
   for (int i = 0; i < width / 32; i++)
      untypedDst[i] = bld.getSSA();
   if (width < 32)
      untypedDst[0] = bld.getSSA();

   // Remember the original (typed) defs before retargeting the load.
   for (int i = 0; i < 4; i++) {
      typedDst[i] = su->getDef(i);
   }

   // Set the untyped dsts as the su's destinations
   for (int i = 0; i < 4; i++)
      su->setDef(i, untypedDst[i]);

   bld.setPosition(su, true);

   // Unpack each component into the typed dsts
   // 'bits' tracks the running bit offset of component i within the texel.
   int bits = 0;
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
      if (!typedDst[i])
         continue;
      if (i >= format->components) {
         // Components not present in the format read back as 0, except
         // alpha which reads back as 1.
         if (format->type == FLOAT ||
             format->type == UNORM ||
             format->type == SNORM)
            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
         else
            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
         continue;
      }

      // Get just that component's data into the relevant place
      if (format->bits[i] == 32)
         bld.mkMov(typedDst[i], untypedDst[i]);
      else if (format->bits[i] == 16)
         // 16-bit: CVT selects the half-word via subOp.
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[i / 2])
         ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
      else if (format->bits[i] == 8)
         // 8-bit: CVT selects the byte via subOp.
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[0])->subOp = i;
      else {
         // Odd widths (e.g. 10/11 bits): extract the bitfield explicitly,
         // then convert if the format is normalized.
         bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
                   bld.mkImm((bits % 32) | (format->bits[i] << 8)));
         if (format->type == UNORM || format->type == SNORM)
            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
      }

      // Normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
      else if (format->type == FLOAT && format->bits[i] < 16) {
         // Small floats: shift the mantissa/exponent into f16 position,
         // then widen f16 -> f32.
         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
      }
   }

   // BGRA formats store blue/red swapped relative to the shader's view.
   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }
}
   2002 
   2003 void
   2004 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
   2005 {
   2006    processSurfaceCoordsNVE4(su);
   2007 
   2008    if (su->op == OP_SULDP)
   2009       convertSurfaceFormat(su);
   2010 
   2011    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
   2012       assert(su->getPredicate());
   2013       Value *pred =
   2014          bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
   2015                     su->getPredicate(), su->getSrc(2));
   2016 
   2017       Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
   2018       red->subOp = su->subOp;
   2019       red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
   2020       red->setSrc(1, su->getSrc(3));
   2021       if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
   2022          red->setSrc(2, su->getSrc(4));
   2023       red->setIndirect(0, 0, su->getSrc(0));
   2024 
   2025       // make sure to initialize dst value when the atomic operation is not
   2026       // performed
   2027       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
   2028 
   2029       assert(su->cc == CC_NOT_P);
   2030       red->setPredicate(su->cc, pred);
   2031       mov->setPredicate(CC_P, pred);
   2032 
   2033       bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
   2034                 red->getDef(0), mov->getDef(0));
   2035 
   2036       delete_Instruction(bld.getProgram(), su);
   2037       handleCasExch(red, true);
   2038    }
   2039 
   2040    if (su->op == OP_SUSTB || su->op == OP_SUSTP)
   2041       su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
   2042 }
   2043 
// Fermi (NVC0) variant: surfaces are accessed through hardware binding
// points, so we only scale the coordinates (X into bytes for typed access,
// layer index by the array pitch) and attach the bound-check predicate.
void
NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *src[3];
   Value *v;
   Value *ind = su->getIndirectR();

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   if (ind) {
      // Wrap the dynamic surface index to the 8 binding points.
      Value *ptr;
      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
      su->setIndirectR(ptr);
   }

   // get surface coordinates
   for (c = 0; c < arg; ++c)
      src[c] = su->getSrc(c);
   for (; c < 3; ++c)
      src[c] = zero;

   // calculate pixel offset
   if (su->op == OP_SULDP || su->op == OP_SUREDP) {
      // Typed access: scale the X element index by the texel byte size.
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE);
      su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
   }

   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
      assert(dim > 1);
      su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
   }

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      // total bits per texel of the shader-declared format
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // make sure that the format doesn't mismatch when it's not FMT_NONE
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}
   2105 
// Lower a surface operation on Fermi. Reductions are split into an address
// computation (SULEA) followed by a predicated global atomic whose result
// is unioned with 0 for the not-performed case.
void
NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
{
   if (su->tex.target == TEX_TARGET_1D_ARRAY) {
      /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
       * will simplify the lowering pass and the texture constraints. */
      su->moveSources(1, 1);
      su->setSrc(1, bld.loadImm(NULL, 0));
      su->tex.target = TEX_TARGET_2D_ARRAY;
   }

   processSurfaceCoordsNVC0(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      const int dim = su->tex.target.getDim();
      const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
      LValue *addr = bld.getSSA(8);
      Value *def = su->getDef(0);

      // Reuse the surface instruction as an address calculation only.
      su->op = OP_SULEA;

      // Set the destination to the address
      su->dType = TYPE_U64;
      su->setDef(0, addr);
      su->setDef(1, su->getPredicate());

      bld.setPosition(su, true);

      // Perform the atomic op
      Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
      red->subOp = su->subOp;
      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
      red->setSrc(1, su->getSrc(arg));
      if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(arg + 1));
      red->setIndirect(0, 0, addr);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      red->setPredicate(su->cc, su->getPredicate());
      mov->setPredicate(CC_P, su->getPredicate());

      // Select between the atomic's result and the fallback 0.
      bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));

      handleCasExch(red, false);
   }
}
   2159 
// Maxwell (GM107) variant: surface instructions take a handle source, so we
// only append the handle after the existing sources and attach the
// bound-check predicate.
void
NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   Value *ind = su->getIndirectR();
   int pos = 0;

   bld.setPosition(su, false);

   // add texture handle
   // The handle goes after the coordinates plus the op's extra sources
   // (store data, atomic operands), so the position depends on the op.
   switch (su->op) {
   case OP_SUSTP:
      pos = 4;
      break;
   case OP_SUREDP:
      pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
      break;
   default:
      assert(pos == 0);
      break;
   }
   // NOTE(review): the handle is looked up at slot + 32 — presumably image
   // handles follow the 32 texture handles; confirm against the driver's
   // binding layout.
   su->setSrc(arg + pos, loadTexHandle(ind, slot + 32));

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      // total bits per texel of the shader-declared format
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // make sure that the format doesn't mismatch when it's not FMT_NONE
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}
   2204 
// Lower a surface operation on Maxwell: attach handle/predicate, unpack
// typed loads, and give predicated-off reductions a well-defined 0 result.
void
NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
{
   processSurfaceCoordsGM107(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDP) {
      Value *def = su->getDef(0);

      // Demote to the raw (byte-addressed) reduction and redirect its
      // result through a fresh SSA value so we can merge in the fallback.
      su->op = OP_SUREDB;
      su->setDef(0, bld.getSSA());

      bld.setPosition(su, true);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      mov->setPredicate(CC_P, su->getPredicate());

      // Merge the (predicated) reduction result with the fallback 0.
      bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
   }
}
   2231 
   2232 bool
   2233 NVC0LoweringPass::handleWRSV(Instruction *i)
   2234 {
   2235    Instruction *st;
   2236    Symbol *sym;
   2237    uint32_t addr;
   2238 
   2239    // must replace, $sreg are not writeable
   2240    addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
   2241    if (addr >= 0x400)
   2242       return false;
   2243    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
   2244 
   2245    st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
   2246                     i->getSrc(1));
   2247    st->perPatch = i->perPatch;
   2248 
   2249    bld.getBB()->remove(i);
   2250    return true;
   2251 }
   2252 
// Lower loads/stores whose register file has no direct hardware path for
// this program type: shader inputs (constbuf / VFETCH), constant buffers
// (redirected to bounds-checked global memory on Kepler compute), TCS
// outputs, and storage buffers (bounds-checked global memory).
void
NVC0LoweringPass::handleLDST(Instruction *i)
{
   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
      if (prog->getType() == Program::TYPE_COMPUTE) {
         // Compute shader "inputs" are read from constbuf 0 instead.
         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
         i->getSrc(0)->reg.fileIndex = 0;
      } else
      if (prog->getType() == Program::TYPE_GEOMETRY &&
          i->src(0).isIndirect(0)) {
         // XXX: this assumes vec4 units
         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                 i->getIndirect(0, 0), bld.mkImm(4));
         i->setIndirect(0, 0, ptr);
         i->op = OP_VFETCH;
      } else {
         i->op = OP_VFETCH;
         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
      }
   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
          prog->getType() == Program::TYPE_COMPUTE) {
         // The launch descriptor only allows to set up 8 CBs, but OpenGL
         // requires at least 12 UBOs. To bypass this limitation, we store the
         // addrs into the driver constbuf and we directly load from the global
         // memory.
         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
         Value *ind = i->getIndirect(0, 1);

         // Direct access to cb0 can stay a real constbuf access.
         if (!ind && fileIndex == -1)
            return;

         if (ind) {
            // Clamp the UBO index when an indirect access is used to avoid
            // loading information from the wrong place in the driver cb.
            // TODO - synchronize the max with the driver.
            // NOTE(review): 'ind' is used as both the def and a source of
            // OP_MIN instead of a fresh SSA value — verify this is intended.
            ind = bld.mkOp2v(OP_MIN, TYPE_U32, ind,
                             bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
                                        ind, bld.loadImm(NULL, fileIndex)),
                             bld.loadImm(NULL, 13));
            fileIndex = 0;
         }

         // End offset of the access, checked against the UBO length below.
         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
         Value *length = loadUboLength32(ind, fileIndex * 16);
         Value *pred = new_LValue(func, FILE_PREDICATE);
         if (i->src(0).isIndirect(0)) {
            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
         }
         // Turn the constbuf access into a predicated global access.
         i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
         i->setPredicate(CC_NOT_P, pred);
         // Out-of-bounds reads yield 0: union the predicated load's result
         // with a predicated constant 0.
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      } else if (i->src(0).isIndirect(1)) {
         // Indirect constbuf index: fold it into the address and use the
         // LDC_IS sub-op.
         Value *ptr;
         if (i->src(0).isIndirect(0))
            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(0x1010),
                             i->getIndirect(0, 0));
         else
            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(16));
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         i->subOp = NV50_IR_SUBOP_LDC_IS;
      }
   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
      i->op = OP_VFETCH;
   } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
      // SSBO access: address/length come from the driver constbuf; the
      // access itself becomes a bounds-checked global access.
      Value *ind = i->getIndirect(0, 1);
      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
      // XXX come up with a way not to do this for EVERY little access but
      // rather to batch these up somehow. Unfortunately we've lost the
      // information about the field width by the time we get here.
      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (i->src(0).isIndirect(0)) {
         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
      }
      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, ptr);
      i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      i->setPredicate(CC_NOT_P, pred);
      if (i->defExists(0)) {
         // Loads only: out-of-bounds reads yield 0.
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      }
   }
}
   2361 
   2362 void
   2363 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
   2364 {
   2365    Value *laneid = bld.getSSA();
   2366    Value *x, *y;
   2367 
   2368    bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
   2369 
   2370    if (c == 0) {
   2371       x = dst;
   2372       y = NULL;
   2373    } else
   2374    if (c == 1) {
   2375       x = NULL;
   2376       y = dst;
   2377    } else {
   2378       assert(c == 2);
   2379       if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
   2380          bld.mkMov(dst, bld.loadImm(NULL, 0));
   2381          return;
   2382       }
   2383       x = bld.getSSA();
   2384       y = bld.getSSA();
   2385    }
   2386    if (x)
   2387       bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
   2388    if (y)
   2389       bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
   2390 
   2391    if (c == 2) {
   2392       bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
   2393       bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
   2394    }
   2395 }
   2396 
// Lower OP_RDSV (read system value) into whatever the hardware actually
// provides for that value: a $sreg mov, an interpolation, a PIXLD, a
// const-buffer load of driver-supplied data, or an attribute fetch.
bool
NVC0LoweringPass::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   const SVSemantic sv = sym->reg.data.sv.sv;
   Value *vtx = NULL;
   Instruction *ld;
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);

   if (addr >= 0x400) {
      // mov $sreg
      if (sym->reg.data.sv.index == 3) {
         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
      }
      if (sv == SV_VERTEX_COUNT) {
         // Extract the bitfield holding the count from the raw sreg value
         // (EXTBF immediate 0x808: presumably 8 bits at offset 8 — verify
         // against the EXTBF encoding).
         bld.setPosition(i, true);
         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
      }
      return true;
   }

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      if (i->srcExists(1)) {
         // Pass offset through to the interpolation logic
         ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
                           i->getDef(0), addr, NULL);
         ld->setSrc(1, i->getSrc(1));
      } else {
         bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      }
      break;
   case SV_FACE:
   {
      // Flat-interpolate the face attribute; when the shader wants a float,
      // convert the integer value to +/-1.0f via OR 1 / NEG / CVT.
      Value *face = i->getDef(0);
      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
         bld.mkOp1(OP_NEG, TYPE_S32, face, face);
         bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
      }
   }
      break;
   case SV_TESS_COORD:
      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
      break;
   case SV_NTID:
   case SV_NCTAID:
   case SV_GRIDID:
      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
      if (sym->reg.data.sv.index == 3) {
         // 4th component is a constant (1 for sizes, 0 for grid id).
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
         return true;
      }
      // Fallthrough
   case SV_WORK_DIM:
      // Grid info is uploaded by the driver into the aux const buffer.
      addr += prog->driver->prop.cp.gridInfoBase;
      bld.mkLoad(TYPE_U32, i->getDef(0),
                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                              TYPE_U32, addr), NULL);
      break;
   case SV_SAMPLE_INDEX:
      // TODO: Properly pass source as an address in the PIX address space
      // (which can be of the form [r0+offset]). But this is currently
      // unnecessary.
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      break;
   case SV_SAMPLE_POS: {
      // Read the sample id, then use it (scaled by 8 bytes per sample) to
      // index the driver's sample position table in the aux const buffer.
      Value *off = new_LValue(func, FILE_GPR);
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
      bld.mkLoad(TYPE_F32,
                 i->getDef(0),
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase +
                       4 * sym->reg.data.sv.index),
                 off);
      break;
   }
   case SV_SAMPLE_MASK: {
      // Coverage mask, optionally restricted to this invocation's own
      // sample bit when not running per-sample.
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
      Instruction *sampleid =
         bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      Value *masked =
         bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
                    bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                               bld.loadImm(NULL, 1), sampleid->getDef(0)));
      if (prog->driver->prop.fp.persampleInvocation) {
         bld.mkMov(i->getDef(0), masked);
      } else {
         // SELP subOp 1 presumably selects based on a system predicate
         // (per-sample shading enabled) — confirm against the emitter.
         bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
                   bld.mkImm(0))
            ->subOp = 1;
      }
      break;
   }
   case SV_BASEVERTEX:
   case SV_BASEINSTANCE:
   case SV_DRAWID:
      // Draw parameters live at consecutive dwords in the aux const buffer.
      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
                      bld.mkSymbol(FILE_MEMORY_CONST,
                                   prog->driver->io.auxCBSlot,
                                   TYPE_U32,
                                   prog->driver->io.drawInfoBase +
                                   4 * (sv - SV_BASEVERTEX)),
                      NULL);
      break;
   default:
      // Generic case: fetch the value as a shader input. Tess eval shaders
      // need a PFETCH to obtain the per-vertex base address first.
      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      if (prog->getType() == Program::TYPE_FRAGMENT) {
         bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
      } else {
         ld = bld.mkFetch(i->getDef(0), i->dType,
                          FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
         ld->perPatch = i->perPatch;
      }
      break;
   }
   // The original RDSV has been fully replaced by the emitted sequence.
   bld.getBB()->remove(i);
   return true;
}
   2529 
   2530 bool
   2531 NVC0LoweringPass::handleDIV(Instruction *i)
   2532 {
   2533    if (!isFloatType(i->dType))
   2534       return true;
   2535    bld.setPosition(i, false);
   2536    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
   2537    i->op = OP_MUL;
   2538    i->setSrc(1, rcp->getDef(0));
   2539    return true;
   2540 }
   2541 
   2542 bool
   2543 NVC0LoweringPass::handleMOD(Instruction *i)
   2544 {
   2545    if (!isFloatType(i->dType))
   2546       return true;
   2547    LValue *value = bld.getScratch(typeSizeof(i->dType));
   2548    bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
   2549    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
   2550    bld.mkOp1(OP_TRUNC, i->dType, value, value);
   2551    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
   2552    i->op = OP_SUB;
   2553    i->setSrc(1, value);
   2554    return true;
   2555 }
   2556 
   2557 bool
   2558 NVC0LoweringPass::handleSQRT(Instruction *i)
   2559 {
   2560    if (i->dType == TYPE_F64) {
   2561       Value *pred = bld.getSSA(1, FILE_PREDICATE);
   2562       Value *zero = bld.loadImm(NULL, 0.0);
   2563       Value *dst = bld.getSSA(8);
   2564       bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
   2565       bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
   2566       bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
   2567       i->op = OP_MUL;
   2568       i->setSrc(1, dst);
   2569       // TODO: Handle this properly with a library function
   2570    } else {
   2571       bld.setPosition(i, true);
   2572       i->op = OP_RSQ;
   2573       bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
   2574    }
   2575 
   2576    return true;
   2577 }
   2578 
   2579 bool
   2580 NVC0LoweringPass::handlePOW(Instruction *i)
   2581 {
   2582    LValue *val = bld.getScratch();
   2583 
   2584    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   2585    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   2586    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
   2587 
   2588    i->op = OP_EX2;
   2589    i->setSrc(0, val);
   2590    i->setSrc(1, NULL);
   2591 
   2592    return true;
   2593 }
   2594 
   2595 bool
   2596 NVC0LoweringPass::handleEXPORT(Instruction *i)
   2597 {
   2598    if (prog->getType() == Program::TYPE_FRAGMENT) {
   2599       int id = i->getSrc(0)->reg.data.offset / 4;
   2600 
   2601       if (i->src(0).isIndirect(0)) // TODO, ugly
   2602          return false;
   2603       i->op = OP_MOV;
   2604       i->subOp = NV50_IR_SUBOP_MOV_FINAL;
   2605       i->src(0).set(i->src(1));
   2606       i->setSrc(1, NULL);
   2607       i->setDef(0, new_LValue(func, FILE_GPR));
   2608       i->getDef(0)->reg.data.id = id;
   2609 
   2610       prog->maxGPR = MAX2(prog->maxGPR, id);
   2611    } else
   2612    if (prog->getType() == Program::TYPE_GEOMETRY) {
   2613       i->setIndirect(0, 1, gpEmitAddress);
   2614    }
   2615    return true;
   2616 }
   2617 
   2618 bool
   2619 NVC0LoweringPass::handleOUT(Instruction *i)
   2620 {
   2621    Instruction *prev = i->prev;
   2622    ImmediateValue stream, prevStream;
   2623 
   2624    // Only merge if the stream ids match. Also, note that the previous
   2625    // instruction would have already been lowered, so we take arg1 from it.
   2626    if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
   2627        i->src(0).getImmediate(stream) &&
   2628        prev->src(1).getImmediate(prevStream) &&
   2629        stream.reg.data.u32 == prevStream.reg.data.u32) {
   2630       i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
   2631       delete_Instruction(prog, i);
   2632    } else {
   2633       assert(gpEmitAddress);
   2634       i->setDef(0, gpEmitAddress);
   2635       i->setSrc(1, i->getSrc(0));
   2636       i->setSrc(0, gpEmitAddress);
   2637    }
   2638    return true;
   2639 }
   2640 
   2641 // Generate a binary predicate if an instruction is predicated by
   2642 // e.g. an f32 value.
   2643 void
   2644 NVC0LoweringPass::checkPredicate(Instruction *insn)
   2645 {
   2646    Value *pred = insn->getPredicate();
   2647    Value *pdst;
   2648 
   2649    if (!pred || pred->reg.file == FILE_PREDICATE)
   2650       return;
   2651    pdst = new_LValue(func, FILE_PREDICATE);
   2652 
   2653    // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   2654    //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
   2655 
   2656    bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
   2657 
   2658    insn->setPredicate(insn->cc, pdst);
   2659 }
   2660 
   2661 //
   2662 // - add quadop dance for texturing
   2663 // - put FP outputs in GPRs
   2664 // - convert instruction sequences
   2665 //
// Main per-instruction dispatch of the lowering pass; returns false only
// when a handler could not lower the instruction.
bool
NVC0LoweringPass::visit(Instruction *i)
{
   bool ret = true;
   // Handlers emit their replacement code before the instruction by default.
   bld.setPosition(i, false);

   // Turn predication by non-predicate-file values into a real predicate.
   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
     return handleTXQ(i->asTex());
   case OP_EX2:
      // EX2 needs its source pre-processed by PREEX2 (see handlePOW).
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      ret = handleEXPORT(i);
      break;
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_STORE:
   case OP_LOAD:
      handleLDST(i);
      break;
   case OP_ATOM:
   {
      // Record whether this targeted a buffer before handleATOM runs,
      // since it may change the source's file.
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      // Surface ops are lowered per chipset generation.
      if (targ->getChipset() >= NVISA_GM107_CHIPSET)
         handleSurfaceOpGM107(i->asTex());
      else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      else
         handleSurfaceOpNVC0(i->asTex());
      break;
   case OP_SUQ:
      handleSUQ(i->asTex());
      break;
   case OP_BUFQ:
      handleBUFQ(i);
      break;
   default:
      break;
   }

   /* Kepler+ has a special opcode to compute a new base address to be used
    * for indirect loads.
    *
    * Maxwell+ has an additional similar requirement for indirect
    * interpolation ops in frag shaders.
    */
   bool doAfetch = false;
   if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
       !i->perPatch &&
       (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }
   if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
       (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }

   if (doAfetch) {
      // Let AFETCH compute the indirect base (it keeps the original src
      // with its offset); the instruction itself then uses a zero-offset
      // copy of the symbol, indirect through AFETCH's result.
      Value *addr = cloneShallow(func, i->getSrc(0));
      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
                                      i->getSrc(0));
      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
      addr->reg.data.offset = 0;
      i->setSrc(0, addr);
      i->setIndirect(0, 0, afetch->getDef(0));
   }

   return ret;
}
   2775 
   2776 bool
   2777 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
   2778 {
   2779    if (stage == CG_STAGE_PRE_SSA) {
   2780       NVC0LoweringPass pass(prog);
   2781       return pass.run(prog, false, true);
   2782    } else
   2783    if (stage == CG_STAGE_POST_RA) {
   2784       NVC0LegalizePostRA pass(prog);
   2785       return pass.run(prog, false, true);
   2786    } else
   2787    if (stage == CG_STAGE_SSA) {
   2788       NVC0LegalizeSSA pass;
   2789       return pass.run(prog, false, true);
   2790    }
   2791    return false;
   2792 }
   2793 
   2794 } // namespace nv50_ir
   2795