Home | History | Annotate | Download | only in codegen
      1 /*
      2  * Copyright 2011 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "codegen/nv50_ir.h"
     24 #include "codegen/nv50_ir_build_util.h"
     25 
     26 #include "codegen/nv50_ir_target_nvc0.h"
     27 #include "codegen/nv50_ir_lowering_nvc0.h"
     28 
     29 #include <limits>
     30 
     31 namespace nv50_ir {
     32 
// Per-lane operation selectors used to build a quad-op descriptor.
// NOTE(review): the meaning of each selector is inferred from its name only
// (add, reversed subtract, subtract, move of second operand) - confirm against
// the code that consumes the descriptor.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four 2-bit QOP_* selectors into a single byte: the first argument
// lands in bits 7:6 and the last one in bits 1:0. The column header below
// names the lane each argument applies to.
//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
     42 
     43 void
     44 NVC0LegalizeSSA::handleDIV(Instruction *i)
     45 {
     46    FlowInstruction *call;
     47    int builtin;
     48 
     49    bld.setPosition(i, false);
     50 
     51    // Generate movs to the input regs for the call we want to generate
     52    for (int s = 0; i->srcExists(s); ++s) {
     53       Instruction *ld = i->getSrc(s)->getInsn();
     54       assert(ld->getSrc(0) != NULL);
     55       // check if we are moving an immediate, propagate it in that case
     56       if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
     57             !(ld->src(0).getFile() == FILE_IMMEDIATE))
     58          bld.mkMovToReg(s, i->getSrc(s));
     59       else {
     60          bld.mkMovToReg(s, ld->getSrc(0));
     61          // Clear the src, to make code elimination possible here before we
     62          // delete the instruction i later
     63          i->setSrc(s, NULL);
     64          if (ld->isDead())
     65             delete_Instruction(prog, ld);
     66       }
     67    }
     68 
     69    switch (i->dType) {
     70    case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
     71    case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
     72    default:
     73       return;
     74    }
     75    call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
     76    bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
     77    bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
     78    bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
     79 
     80    call->fixed = 1;
     81    call->absolute = call->builtin = 1;
     82    call->target.builtin = builtin;
     83    delete_Instruction(prog, i);
     84 }
     85 
     86 void
     87 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
     88 {
     89    assert(i->dType == TYPE_F64);
     90    // There are instructions that will compute the high 32 bits of the 64-bit
     91    // float. We will just stick 0 in the bottom 32 bits.
     92 
     93    bld.setPosition(i, false);
     94 
     95    // 1. Take the source and it up.
     96    Value *src[2], *dst[2], *def = i->getDef(0);
     97    bld.mkSplit(src, 4, i->getSrc(0));
     98 
     99    // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
    100    dst[0] = bld.loadImm(NULL, 0);
    101    dst[1] = bld.getSSA();
    102 
    103    // 3. The new version of the instruction takes the high 32 bits of the
    104    // source and outputs the high 32 bits of the destination.
    105    i->setSrc(0, src[1]);
    106    i->setDef(0, dst[1]);
    107    i->setType(TYPE_F32);
    108    i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
    109 
    110    // 4. Recombine the two dst pieces back into the original destination.
    111    bld.setPosition(i, true);
    112    bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
    113 }
    114 
    115 void
    116 NVC0LegalizeSSA::handleFTZ(Instruction *i)
    117 {
    118    // Only want to flush float inputs
    119    assert(i->sType == TYPE_F32);
    120 
    121    // If we're already flushing denorms (and NaN's) to zero, no need for this.
    122    if (i->dnz)
    123       return;
    124 
    125    // Only certain classes of operations can flush
    126    OpClass cls = prog->getTarget()->getOpClass(i->op);
    127    if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
    128        cls != OPCLASS_CONVERT)
    129       return;
    130 
    131    i->ftz = true;
    132 }
    133 
    134 void
    135 NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
    136 {
    137    if (i->tex.levelZero)
    138       return;
    139 
    140    ImmediateValue lod;
    141 
    142    // The LOD argument comes right after the coordinates (before depth bias,
    143    // offsets, etc).
    144    int arg = i->tex.target.getArgCount();
    145 
    146    // SM30+ stores the indirect handle as a separate arg, which comes before
    147    // the LOD.
    148    if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
    149        i->tex.rIndirectSrc >= 0)
    150       arg++;
    151    // SM20 stores indirect handle combined with array coordinate
    152    if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
    153        !i->tex.target.isArray() &&
    154        i->tex.rIndirectSrc >= 0)
    155       arg++;
    156 
    157    if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
    158       return;
    159 
    160    if (i->op == OP_TXL)
    161       i->op = OP_TEX;
    162    i->tex.levelZero = true;
    163    i->moveSources(arg + 1, -1);
    164 }
    165 
// Lowers a 64-bit SHL/SHR into operations on the two 32-bit halves of the
// shifted value. The shift amount (source 1) is used as-is; source 0 is split
// and the two result halves are merged back into the original destination.
//
// Chipsets below NVISA_GK20A_CHIPSET get a fully emulated sequence and the
// original instruction is deleted. Otherwise the original instruction is
// kept, a second instruction with the same op is inserted after it, and both
// are given three sources.
void
NVC0LegalizeSSA::handleShift(Instruction *lo)
{
   Value *shift = lo->getSrc(1);
   Value *dst64 = lo->getDef(0);
   Value *src[2], *dst[2];
   operation op = lo->op;

   bld.setPosition(lo, false);

   // src[0] / src[1] receive the two 4-byte pieces of the 64-bit operand.
   bld.mkSplit(src, 4, lo->getSrc(0));

   // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
   // be completely emulated. For SM35+, we can use the more directed SHF
   // operations.
   if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
      // The strategy here is to handle shifts of at most 32 and shifts above
      // 32 as separate parts.
      //
      // For SHL:
      // If the shift is <= 32, then
      //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
      // If the shift is > 32, then
      //   (HI,LO) << x = (LO << (x - 32), 0)
      //
      // For SHR:
      // If the shift is <= 32, then
      //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
      // If the shift is > 32, then
      //   (HI,LO) >> x = (0, HI >> (x - 32))
      //
      // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
      // can use to our advantage. Also note the structural similarities
      // between the right/left cases. The main difference is swapping hi/lo
      // on input and output.

      Value *x32_minus_shift, *pred, *hi1, *hi2;
      DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
      operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
      // For SHR the roles of the two halves are mirrored; swap on input here
      // and swap the results back before merging.
      if (op == OP_SHR)
         std::swap(src[0], src[1]);
      // x32_minus_shift = -shift + 0x20 (the negation is a source modifier).
      bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
         ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      // pred is true when shift <= 32; it selects which of the two HI
      // computations below executes.
      bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
                TYPE_U32, shift, bld.mkImm(32));
      // Compute HI (shift <= 32)
      bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
                bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
                bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
         ->setPredicate(CC_P, pred);
      // Compute LO (all shift values)
      bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
      // Compute HI (shift > 32); the shift amount is -(32 - shift).
      bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
                bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
         ->setPredicate(CC_NOT_P, pred);
      // Join the two predicated HI candidates into a single value.
      bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
      if (op == OP_SHR)
         std::swap(dst[0], dst[1]);
      bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
      delete_Instruction(prog, lo);
      return;
   }

   // Newer chipsets: keep the original instruction and add a second one with
   // the same op right behind it.
   Instruction *hi = new_Instruction(func, op, TYPE_U32);
   lo->bb->insertAfter(lo, hi);

   hi->sType = lo->sType;
   lo->dType = TYPE_U32;

   // The new instruction defines dst[1], the original one dst[0].
   hi->setDef(0, (dst[1] = bld.getSSA()));
   if (lo->op == OP_SHR)
      hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
   lo->setDef(0, (dst[0] = bld.getSSA()));

   // The final MERGE is emitted after the newly inserted instruction.
   bld.setPosition(hi, true);

   // For SHL the two pointers trade places, so the source assignment below
   // applies to the opposite instruction compared to SHR. The defs assigned
   // above stay with their instructions.
   if (lo->op == OP_SHL)
      std::swap(hi, lo);

   hi->setSrc(0, new_ImmediateValue(prog, 0u));
   hi->setSrc(1, shift);
   hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);

   lo->setSrc(0, src[0]);
   lo->setSrc(1, shift);
   lo->setSrc(2, src[1]);

   bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
}
    256 
    257 void
    258 NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
    259 {
    260    DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
    261    Value *carry;
    262    Value *src0[2], *src1[2];
    263    bld.setPosition(cmp, false);
    264 
    265    bld.mkSplit(src0, 4, cmp->getSrc(0));
    266    bld.mkSplit(src1, 4, cmp->getSrc(1));
    267    bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
    268       ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
    269    cmp->setFlagsSrc(cmp->srcCount(), carry);
    270    cmp->setSrc(0, src0[1]);
    271    cmp->setSrc(1, src1[1]);
    272    cmp->sType = hTy;
    273 }
    274 
    275 bool
    276 NVC0LegalizeSSA::visit(Function *fn)
    277 {
    278    bld.setProgram(fn->getProgram());
    279    return true;
    280 }
    281 
    282 bool
    283 NVC0LegalizeSSA::visit(BasicBlock *bb)
    284 {
    285    Instruction *next;
    286    for (Instruction *i = bb->getEntry(); i; i = next) {
    287       next = i->next;
    288 
    289       if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
    290          handleFTZ(i);
    291 
    292       switch (i->op) {
    293       case OP_DIV:
    294       case OP_MOD:
    295          if (i->sType != TYPE_F32)
    296             handleDIV(i);
    297          break;
    298       case OP_RCP:
    299       case OP_RSQ:
    300          if (i->dType == TYPE_F64)
    301             handleRCPRSQ(i);
    302          break;
    303       case OP_TXL:
    304       case OP_TXF:
    305          handleTEXLOD(i->asTex());
    306          break;
    307       case OP_SHR:
    308       case OP_SHL:
    309          if (typeSizeof(i->sType) == 8)
    310             handleShift(i);
    311          break;
    312       case OP_SET:
    313       case OP_SET_AND:
    314       case OP_SET_OR:
    315       case OP_SET_XOR:
    316          if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
    317             handleSET(i->asCmp());
    318          break;
    319       default:
    320          break;
    321       }
    322    }
    323    return true;
    324 }
    325 
    326 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
    327    : rZero(NULL),
    328      carry(NULL),
    329      pOne(NULL),
    330      needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
    331                 prog->getTarget()->getChipset() < 0x110)
    332 {
    333 }
    334 
    335 bool
    336 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
    337                                     const Instruction *early) const
    338 {
    339    if (early->bb == later->bb)
    340       return early->serial < later->serial;
    341    return later->bb->dominatedBy(early->bb);
    342 }
    343 
    344 void
    345 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
    346                               Instruction *usei, const Instruction *texi)
    347 {
    348    bool add = true;
    349    bool dominated = insnDominatedBy(usei, texi);
    350    // Uses before the tex have to all be included. Just because an earlier
    351    // instruction dominates another instruction doesn't mean that there's no
    352    // way to get from the tex to the later instruction. For example you could
    353    // have nested loops, with the tex in the inner loop, and uses before it in
    354    // both loops - even though the outer loop's instruction would dominate the
    355    // inner's, we still want a texbar before the inner loop's instruction.
    356    //
    357    // However we can still use the eliding logic between uses dominated by the
    358    // tex instruction, as that is unambiguously correct.
    359    if (dominated) {
    360       for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
    361          if (it->after) {
    362             if (insnDominatedBy(usei, it->insn)) {
    363                add = false;
    364                break;
    365             }
    366             if (insnDominatedBy(it->insn, usei)) {
    367                it = uses.erase(it);
    368                continue;
    369             }
    370          }
    371          ++it;
    372       }
    373    }
    374    if (add)
    375       uses.push_back(TexUse(usei, texi, dominated));
    376 }
    377 
    378 // While it might be tempting to use the an algorithm that just looks at tex
    379 // uses, not all texture results are guaranteed to be used on all paths. In
    380 // the case where along some control flow path a texture result is never used,
    381 // we might reuse that register for something else, creating a
    382 // write-after-write hazard. So we have to manually look through all
    383 // instructions looking for ones that reference the registers in question.
    384 void
    385 NVC0LegalizePostRA::findFirstUses(
    386    Instruction *texi, std::list<TexUse> &uses)
    387 {
    388    int minGPR = texi->def(0).rep()->reg.data.id;
    389    int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
    390 
    391    unordered_set<const BasicBlock *> visited;
    392    findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
    393 }
    394 
    395 void
    396 NVC0LegalizePostRA::findFirstUsesBB(
    397    int minGPR, int maxGPR, Instruction *start,
    398    const Instruction *texi, std::list<TexUse> &uses,
    399    unordered_set<const BasicBlock *> &visited)
    400 {
    401    const BasicBlock *bb = start->bb;
    402 
    403    // We don't process the whole bb the first time around. This is correct,
    404    // however we might be in a loop and hit this BB again, and need to process
    405    // the full thing. So only mark a bb as visited if we processed it from the
    406    // beginning.
    407    if (start == bb->getEntry()) {
    408       if (visited.find(bb) != visited.end())
    409          return;
    410       visited.insert(bb);
    411    }
    412 
    413    for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
    414       if (insn->isNop())
    415          continue;
    416 
    417       for (int d = 0; insn->defExists(d); ++d) {
    418          const Value *def = insn->def(d).rep();
    419          if (insn->def(d).getFile() != FILE_GPR ||
    420              def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
    421              def->reg.data.id > maxGPR)
    422             continue;
    423          addTexUse(uses, insn, texi);
    424          return;
    425       }
    426 
    427       for (int s = 0; insn->srcExists(s); ++s) {
    428          const Value *src = insn->src(s).rep();
    429          if (insn->src(s).getFile() != FILE_GPR ||
    430              src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
    431              src->reg.data.id > maxGPR)
    432             continue;
    433          addTexUse(uses, insn, texi);
    434          return;
    435       }
    436    }
    437 
    438    for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
    439       findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
    440                       texi, uses, visited);
    441    }
    442 }
    443 
    444 // Texture barriers:
    445 // This pass is a bit long and ugly and can probably be optimized.
    446 //
    447 // 1. obtain a list of TEXes and their outputs' first use(s)
    448 // 2. calculate the barrier level of each first use (minimal number of TEXes,
    449 //    over all paths, between the TEX and the use in question)
    450 // 3. for each barrier, if all paths from the source TEX to that barrier
    451 //    contain a barrier of lesser level, it can be culled
    452 bool
    453 NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
    454 {
    455    std::list<TexUse> *uses;
    456    std::vector<Instruction *> texes;
    457    std::vector<int> bbFirstTex;
    458    std::vector<int> bbFirstUse;
    459    std::vector<int> texCounts;
    460    std::vector<TexUse> useVec;
    461    ArrayList insns;
    462 
    463    fn->orderInstructions(insns);
    464 
    465    texCounts.resize(fn->allBBlocks.getSize(), 0);
    466    bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
    467    bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
    468 
    469    // tag BB CFG nodes by their id for later
    470    for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
    471       BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
    472       if (bb)
    473          bb->cfg.tag = bb->getId();
    474    }
    475 
    476    // gather the first uses for each TEX
    477    for (int i = 0; i < insns.getSize(); ++i) {
    478       Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
    479       if (isTextureOp(tex->op)) {
    480          texes.push_back(tex);
    481          if (!texCounts.at(tex->bb->getId()))
    482             bbFirstTex[tex->bb->getId()] = texes.size() - 1;
    483          texCounts[tex->bb->getId()]++;
    484       }
    485    }
    486    insns.clear();
    487    if (texes.empty())
    488       return false;
    489    uses = new std::list<TexUse>[texes.size()];
    490    if (!uses)
    491       return false;
    492    for (size_t i = 0; i < texes.size(); ++i) {
    493       findFirstUses(texes[i], uses[i]);
    494    }
    495 
    496    // determine the barrier level at each use
    497    for (size_t i = 0; i < texes.size(); ++i) {
    498       for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
    499            ++u) {
    500          BasicBlock *tb = texes[i]->bb;
    501          BasicBlock *ub = u->insn->bb;
    502          if (tb == ub) {
    503             u->level = 0;
    504             for (size_t j = i + 1; j < texes.size() &&
    505                     texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
    506                  ++j)
    507                u->level++;
    508          } else {
    509             u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
    510                                                       &ub->cfg, texCounts);
    511             if (u->level < 0) {
    512                WARN("Failed to find path TEX -> TEXBAR\n");
    513                u->level = 0;
    514                continue;
    515             }
    516             // this counted all TEXes in the origin block, correct that
    517             u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
    518             // and did not count the TEXes in the destination block, add those
    519             for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
    520                     texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
    521                  ++j)
    522                u->level++;
    523          }
    524          assert(u->level >= 0);
    525          useVec.push_back(*u);
    526       }
    527    }
    528    delete[] uses;
    529 
    530    // insert the barriers
    531    for (size_t i = 0; i < useVec.size(); ++i) {
    532       Instruction *prev = useVec[i].insn->prev;
    533       if (useVec[i].level < 0)
    534          continue;
    535       if (prev && prev->op == OP_TEXBAR) {
    536          if (prev->subOp > useVec[i].level)
    537             prev->subOp = useVec[i].level;
    538          prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
    539       } else {
    540          Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
    541          bar->fixed = 1;
    542          bar->subOp = useVec[i].level;
    543          // make use explicit to ease latency calculation
    544          bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
    545          useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
    546       }
    547    }
    548 
    549    if (fn->getProgram()->optLevel < 3)
    550       return true;
    551 
    552    std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
    553 
    554    limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
    555    limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
    556    limitS.resize(fn->allBBlocks.getSize());
    557 
    558    // cull unneeded barriers (should do that earlier, but for simplicity)
    559    IteratorRef bi = fn->cfg.iteratorCFG();
    560    // first calculate min/max outstanding TEXes for each BB
    561    for (bi->reset(); !bi->end(); bi->next()) {
    562       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
    563       BasicBlock *bb = BasicBlock::get(n);
    564       int min = 0;
    565       int max = std::numeric_limits<int>::max();
    566       for (Instruction *i = bb->getFirst(); i; i = i->next) {
    567          if (isTextureOp(i->op)) {
    568             min++;
    569             if (max < std::numeric_limits<int>::max())
    570                max++;
    571          } else
    572          if (i->op == OP_TEXBAR) {
    573             min = MIN2(min, i->subOp);
    574             max = MIN2(max, i->subOp);
    575          }
    576       }
    577       // limits when looking at an isolated block
    578       limitS[bb->getId()].min = min;
    579       limitS[bb->getId()].max = max;
    580    }
    581    // propagate the min/max values
    582    for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
    583       for (bi->reset(); !bi->end(); bi->next()) {
    584          Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
    585          BasicBlock *bb = BasicBlock::get(n);
    586          const int bbId = bb->getId();
    587          for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
    588             BasicBlock *in = BasicBlock::get(ei.getNode());
    589             const int inId = in->getId();
    590             limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
    591             limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
    592          }
    593          // I just hope this is correct ...
    594          if (limitS[bbId].max == std::numeric_limits<int>::max()) {
    595             // no barrier
    596             limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
    597             limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
    598          } else {
    599             // block contained a barrier
    600             limitB[bbId].min = MIN2(limitS[bbId].max,
    601                                     limitT[bbId].min + limitS[bbId].min);
    602             limitB[bbId].max = MIN2(limitS[bbId].max,
    603                                     limitT[bbId].max + limitS[bbId].min);
    604          }
    605       }
    606    }
    607    // finally delete unnecessary barriers
    608    for (bi->reset(); !bi->end(); bi->next()) {
    609       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
    610       BasicBlock *bb = BasicBlock::get(n);
    611       Instruction *prev = NULL;
    612       Instruction *next;
    613       int max = limitT[bb->getId()].max;
    614       for (Instruction *i = bb->getFirst(); i; i = next) {
    615          next = i->next;
    616          if (i->op == OP_TEXBAR) {
    617             if (i->subOp >= max) {
    618                delete_Instruction(prog, i);
    619                i = NULL;
    620             } else {
    621                max = i->subOp;
    622                if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
    623                   delete_Instruction(prog, prev);
    624                   prev = NULL;
    625                }
    626             }
    627          } else
    628          if (isTextureOp(i->op)) {
    629             max++;
    630          }
    631          if (i && !i->isNop())
    632             prev = i;
    633       }
    634    }
    635    return true;
    636 }
    637 
    638 bool
    639 NVC0LegalizePostRA::visit(Function *fn)
    640 {
    641    if (needTexBar)
    642       insertTextureBarriers(fn);
    643 
    644    rZero = new_LValue(fn, FILE_GPR);
    645    pOne = new_LValue(fn, FILE_PREDICATE);
    646    carry = new_LValue(fn, FILE_FLAGS);
    647 
    648    rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
    649    carry->reg.data.id = 0;
    650    pOne->reg.data.id = 7;
    651 
    652    return true;
    653 }
    654 
    655 void
    656 NVC0LegalizePostRA::replaceZero(Instruction *i)
    657 {
    658    for (int s = 0; i->srcExists(s); ++s) {
    659       if (s == 2 && i->op == OP_SUCLAMP)
    660          continue;
    661       if (s == 1 && i->op == OP_SHLADD)
    662          continue;
    663       ImmediateValue *imm = i->getSrc(s)->asImm();
    664       if (imm) {
    665          if (i->op == OP_SELP && s == 2) {
    666             i->setSrc(s, pOne);
    667             if (imm->reg.data.u64 == 0)
    668                i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
    669          } else if (imm->reg.data.u64 == 0) {
    670             i->setSrc(s, rZero);
    671          }
    672       }
    673    }
    674 }
    675 
    676 // replace CONT with BRA for single unconditional continue
    677 bool
    678 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
    679 {
    680    if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
    681       return false;
    682    Graph::EdgeIterator ei = bb->cfg.incident();
    683    if (ei.getType() != Graph::Edge::BACK)
    684       ei.next();
    685    if (ei.getType() != Graph::Edge::BACK)
    686       return false;
    687    BasicBlock *contBB = BasicBlock::get(ei.getNode());
    688 
    689    if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
    690        contBB->getExit()->getPredicate())
    691       return false;
    692    contBB->getExit()->op = OP_BRA;
    693    bb->remove(bb->getEntry()); // delete PRECONT
    694 
    695    ei.next();
    696    assert(ei.end() || ei.getType() != Graph::Edge::BACK);
    697    return true;
    698 }
    699 
    700 // replace branches to join blocks with join ops
    701 void
    702 NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
    703 {
    704    if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
    705       return;
    706    for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
    707       BasicBlock *in = BasicBlock::get(ei.getNode());
    708       Instruction *exit = in->getExit();
    709       if (!exit) {
    710          in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
    711          // there should always be a terminator instruction
    712          WARN("inserted missing terminator in BB:%i\n", in->getId());
    713       } else
    714       if (exit->op == OP_BRA) {
    715          exit->op = OP_JOIN;
    716          exit->asFlow()->limit = 1; // must-not-propagate marker
    717       }
    718    }
    719    bb->remove(bb->getEntry());
    720 }
    721 
    722 bool
    723 NVC0LegalizePostRA::visit(BasicBlock *bb)
    724 {
    725    Instruction *i, *next;
    726 
    727    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
    728    for (i = bb->getFirst(); i; i = next) {
    729       next = i->next;
    730       if (i->op == OP_EMIT || i->op == OP_RESTART) {
    731          if (!i->getDef(0)->refCount())
    732             i->setDef(0, NULL);
    733          if (i->src(0).getFile() == FILE_IMMEDIATE)
    734             i->setSrc(0, rZero); // initial value must be 0
    735          replaceZero(i);
    736       } else
    737       if (i->isNop()) {
    738          bb->remove(i);
    739       } else
    740       if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
    741           prog->getType() != Program::TYPE_COMPUTE) {
    742          // It seems like barriers are never required for tessellation since
    743          // the warp size is 32, and there are always at most 32 tcs threads.
    744          bb->remove(i);
    745       } else
    746       if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
    747          int offset = i->src(0).get()->reg.data.offset;
    748          if (abs(offset) >= 0x10000)
    749             i->src(0).get()->reg.fileIndex += offset >> 16;
    750          i->src(0).get()->reg.data.offset = (int)(short)offset;
    751       } else {
    752          // TODO: Move this to before register allocation for operations that
    753          // need the $c register !
    754          if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
    755             Instruction *hi;
    756             hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
    757             if (hi)
    758                next = hi;
    759          }
    760 
    761          if (i->op != OP_MOV && i->op != OP_PFETCH)
    762             replaceZero(i);
    763       }
    764    }
    765    if (!bb->getEntry())
    766       return true;
    767 
    768    if (!tryReplaceContWithBra(bb))
    769       propagateJoin(bb);
    770 
    771    return true;
    772 }
    773 
    774 NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
    775 {
    776    bld.setProgram(prog);
    777 }
    778 
    779 bool
    780 NVC0LoweringPass::visit(Function *fn)
    781 {
           // Per-function setup; always returns true.
           // Only geometry programs need work: gpEmitAddress is created as an
           // immediate 0 loaded in the CFG root block and, when the function has an
           // exit block, is moved to register 0 at that block's exit instruction.
    782    if (prog->getType() == Program::TYPE_GEOMETRY) {
    783       assert(!strncmp(fn->getName(), "MAIN", 4));
    784       // TODO: when we generate actual functions pass this value along somehow
    785       bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
    786       gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
    787       if (fn->cfgExit) {
    788          bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
    789          bld.mkMovToReg(0, gpEmitAddress);
    790       }
    791    }
    792    return true;
    793 }
    794 
    795 bool
    796 NVC0LoweringPass::visit(BasicBlock *bb)
    797 {
           // No per-block work is done here; the block is always accepted.
    798    return true;
    799 }
    800 
    801 inline Value *
    802 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
    803 {
           // Emits a 32-bit load of the texture handle for `slot` from the driver's
           // auxiliary constant buffer and returns the loaded value.
           //   ptr  - optional indirect index (may be NULL); it is shifted left by 2
           //          (i.e. multiplied by 4) before being used as the load's
           //          indirect address.
           //   slot - binding slot; each slot occupies 4 bytes starting at
           //          io.texBindBase.
    804    uint8_t b = prog->driver->io.auxCBSlot;
    805    uint32_t off = prog->driver->io.texBindBase + slot * 4;
    806
    807    if (ptr)
    808       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));
    809
    810    return bld.
    811       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
    812 }
    813 
    814 // move array source to first slot, convert to u16, add indirections
    815 bool
    816 NVC0LoweringPass::handleTEX(TexInstruction *i)
    817 {
           // Rewrites the sources of texture instruction `i` in place so that they
           // follow the per-chipset operand order summarized in the table below:
           // resolves texture/sampler bindings, moves the array layer and indirect
           // handle into position, packs texel offsets and, on GK104+, pads the
           // source list. Always returns true.
           //
           // dim: number of coordinates (one extra for cube targets)
           // arg: argument count reported by the target
           // lyr: source index of the array layer - the last argument, or the one
           //      before it for multisample targets
    818    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    819    const int arg = i->tex.target.getArgCount();
    820    const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
    821    const int chipset = prog->getTarget()->getChipset();
    822
    823    /* Only normalize in the non-explicit derivatives case. For explicit
    824     * derivatives, this is handled in handleManualTXD.
    825     */
    826    if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
              // Scale the first three sources by 1 / max(|s0|, |s1|, |s2|).
    827       Value *src[3], *val;
    828       int c;
    829       for (c = 0; c < 3; ++c)
    830          src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
    831       val = bld.getScratch();
    832       bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
    833       bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
    834       bld.mkOp1(OP_RCP, TYPE_F32, val, val);
    835       for (c = 0; c < 3; ++c) {
    836          i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
    837                                  i->getSrc(c), val));
    838       }
    839    }
    840
    841    // Arguments to the TEX instruction are a little insane. Even though the
    842    // encoding is identical between SM20 and SM30, the arguments mean
    843    // different things between Fermi and Kepler+. A lot of arguments are
    844    // optional based on flags passed to the instruction. This summarizes the
    845    // order of things.
    846    //
    847    // Fermi:
    848    //  array/indirect
    849    //  coords
    850    //  sample
    851    //  lod bias
    852    //  depth compare
    853    //  offsets:
    854    //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
    855    //    - other: 4 bits each, single reg
    856    //
    857    // Kepler+:
    858    //  indirect handle
    859    //  array (+ offsets for txd in upper 16 bits)
    860    //  coords
    861    //  sample
    862    //  lod bias
    863    //  depth compare
    864    //  offsets (same as fermi, except txd which takes it with array)
    865    //
    866    // Maxwell (tex):
    867    //  array
    868    //  coords
    869    //  indirect handle
    870    //  sample
    871    //  lod bias
    872    //  depth compare
    873    //  offsets
    874    //
    875    // Maxwell (txd):
    876    //  indirect handle
    877    //  coords
    878    //  array + offsets
    879    //  derivatives
    880
    881    if (chipset >= NVISA_GK104_CHIPSET) {
    882       if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
    883          // XXX this ignores tsc, and assumes a 1:1 mapping
    884          assert(i->tex.rIndirectSrc >= 0);
    885          if (!i->tex.bindless) {
                    // Not bindless: fetch the handle from the aux constant buffer and
                    // use it as the indirect texture source.
    886             Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
    887             i->tex.r = 0xff;
    888             i->tex.s = 0x1f;
    889             i->setIndirectR(hnd);
    890          }
    891          i->setIndirectS(NULL);
    892       } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
                 // Direct binding with a single index; r == 0xffff selects the
                 // fbtexBindBase slot instead of a texBindBase-relative one.
    893          if (i->tex.r == 0xffff)
    894             i->tex.r = prog->driver->io.fbtexBindBase / 4;
    895          else
    896             i->tex.r += prog->driver->io.texBindBase / 4;
    897          i->tex.s  = 0; // only a single cX[] value possible here
    898       } else {
                 // Different texture and sampler slots: load both handles and merge
                 // them with INSBF into one value used as the indirect handle.
    899          Value *hnd = bld.getScratch();
    900          Value *rHnd = loadTexHandle(NULL, i->tex.r);
    901          Value *sHnd = loadTexHandle(NULL, i->tex.s);
    902
    903          bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
    904
    905          i->tex.r = 0; // not used for indirect tex
    906          i->tex.s = 0;
    907          i->setIndirectR(hnd);
    908       }
    909       if (i->tex.target.isArray()) {
                 // Convert the layer to u16 (from u32 with saturation for TXF, from
                 // f32 otherwise) and place it before the coordinates, except for
                 // TXD on GM107+ where it goes right after them.
    910          LValue *layer = new_LValue(func, FILE_GPR);
    911          Value *src = i->getSrc(lyr);
    912          const int sat = (i->op == OP_TXF) ? 1 : 0;
    913          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
    914          bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
    915          if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
    916             for (int s = dim; s >= 1; --s)
    917                i->setSrc(s, i->getSrc(s - 1));
    918             i->setSrc(0, layer);
    919          } else {
    920             i->setSrc(dim, layer);
    921          }
    922       }
    923       // Move the indirect reference to the first place
    924       if (i->tex.rIndirectSrc >= 0 && (
    925                 i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
    926          Value *hnd = i->getIndirectR();
    927
    928          i->setIndirectR(NULL);
    929          i->moveSources(0, 1);
    930          i->setSrc(0, hnd);
    931          i->tex.rIndirectSrc = 0;
    932          i->tex.sIndirectSrc = -1;
    933       }
    934       // Move the indirect reference to right after the coords
    935       else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
    936          Value *hnd = i->getIndirectR();
    937
    938          i->setIndirectR(NULL);
    939          i->moveSources(arg, 1);
    940          i->setSrc(arg, hnd);
    941          i->tex.rIndirectSrc = 0;
    942          i->tex.sIndirectSrc = -1;
    943       }
    944    } else
    945    // (nvc0) generate and move the tsc/tic/array source to the front
    946    if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
    947       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
    948
    949       Value *ticRel = i->getIndirectR();
    950       Value *tscRel = i->getIndirectS();
    951
              // r == 0xffff is remapped to fixed indices r = 0x20, s = 0x10.
    952       if (i->tex.r == 0xffff) {
    953          i->tex.r = 0x20;
    954          i->tex.s = 0x10;
    955       }
    956
              // Detach the indirect sources and add the static tic/tsc index to
              // each relative value when that index is non-zero.
    957       if (ticRel) {
    958          i->setSrc(i->tex.rIndirectSrc, NULL);
    959          if (i->tex.r)
    960             ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
    961                                 ticRel, bld.mkImm(i->tex.r));
    962       }
    963       if (tscRel) {
    964          i->setSrc(i->tex.sIndirectSrc, NULL);
    965          if (i->tex.s)
    966             tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
    967                                 tscRel, bld.mkImm(i->tex.s));
    968       }
    969
              // Free source slot 0: rotate the layer to the front if there is one,
              // otherwise shift every source up by one.
    970       Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
    971       if (arrayIndex) {
    972          for (int s = dim; s >= 1; --s)
    973             i->setSrc(s, i->getSrc(s - 1));
    974          i->setSrc(0, arrayIndex);
    975       } else {
    976          i->moveSources(0, 1);
    977       }
    978
              // Seed the combined register with the u16 layer, or 0 without one.
    979       if (arrayIndex) {
    980          int sat = (i->op == OP_TXF) ? 1 : 0;
    981          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
    982          bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
    983       } else {
    984          bld.loadImm(src, 0);
    985       }
    986
              // Insert the relative tic / tsc values into the combined register.
    987       if (ticRel)
    988          bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
    989       if (tscRel)
    990          bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
    991
    992       i->setSrc(0, src);
    993    }
    994
    995    // For nvc0, the sample id has to be in the second operand, as the offset
    996    // does. Right now we don't know how to pass both in, and this case can't
    997    // happen with OpenGL. On nve0, the sample id is part of the texture
    998    // coordinate argument.
    999    assert(chipset >= NVISA_GK104_CHIPSET ||
   1000           !i->tex.useOffsets || !i->tex.target.isMS());
   1001
   1002    // offset is between lod and dc
   1003    if (i->tex.useOffsets) {
   1004       int n, c;
   1005       int s = i->srcCount(0xff, true);
              // Except for TXD on GK104+, open one source slot (two when there are
              // four offsets) at position s, which is placed before the depth
              // compare value for shadow targets.
   1006       if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
   1007          if (i->tex.target.isShadow())
   1008             s--;
   1009          if (i->srcExists(s)) // move potential predicate out of the way
   1010             i->moveSources(s, 1);
   1011          if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
   1012             i->moveSources(s + 1, 1);
   1013       }
   1014       if (i->op == OP_TXG) {
   1015          // Either there is 1 offset, which goes into the 2 low bytes of the
   1016          // first source, or there are 4 offsets, which go into 2 sources (8
   1017          // values, 1 byte each).
   1018          Value *offs[2] = {NULL, NULL};
   1019          for (n = 0; n < i->tex.useOffsets; n++) {
   1020             for (c = 0; c < 2; ++c) {
                       // The first component of every even offset starts a new
                       // register; the rest are inserted as 8-bit fields.
   1021                if ((n % 2) == 0 && c == 0)
   1022                   bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
   1023                else
   1024                   bld.mkOp3(OP_INSBF, TYPE_U32,
   1025                             offs[n / 2],
   1026                             i->offset[n][c].get(),
   1027                             bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
   1028                             offs[n / 2]);
   1029             }
   1030          }
   1031          i->setSrc(s, offs[0]);
   1032          if (offs[1])
   1033             i->setSrc(s + 1, offs[1]);
   1034       } else {
                 // Non-TXG: a single immediate offset, 4 bits per component, packed
                 // into one 12-bit immediate.
   1035          unsigned imm = 0;
   1036          assert(i->tex.useOffsets == 1);
   1037          for (c = 0; c < 3; ++c) {
   1038             ImmediateValue val;
   1039             if (!i->offset[0][c].getImmediate(val))
   1040                assert(!"non-immediate offset passed to non-TXG");
   1041             imm |= (val.reg.data.u32 & 0xf) << (c * 4);
   1042          }
   1043          if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
   1044             // The offset goes into the upper 16 bits of the array index. So
   1045             // create it if it's not already there, and INSBF it if it already
   1046             // is.
   1047             s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
   1048             if (chipset >= NVISA_GM107_CHIPSET)
   1049                s += dim;
   1050             if (i->tex.target.isArray()) {
   1051                Value *offset = bld.getScratch();
   1052                bld.mkOp3(OP_INSBF, TYPE_U32, offset,
   1053                          bld.loadImm(NULL, imm), bld.mkImm(0xc10),
   1054                          i->getSrc(s));
   1055                i->setSrc(s, offset);
   1056             } else {
   1057                i->moveSources(s, 1);
   1058                i->setSrc(s, bld.loadImm(NULL, imm << 16));
   1059             }
   1060          } else {
   1061             i->setSrc(s, bld.loadImm(NULL, imm));
   1062          }
   1063       }
   1064    }
   1065
   1066    if (chipset >= NVISA_GK104_CHIPSET) {
   1067       //
   1068       // If TEX requires more than 4 sources, the 2nd register tuple must be
   1069       // aligned to 4, even if it consists of just a single 4-byte register.
   1070       //
   1071       // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
   1072       //
   1073       int s = i->srcCount(0xff, true);
   1074       if (s > 4 && s < 7) {
   1075          if (i->srcExists(s)) // move potential predicate out of the way
   1076             i->moveSources(s, 7 - s);
   1077          while (s < 7)
   1078             i->setSrc(s++, bld.loadImm(NULL, 0));
   1079       }
   1080    }
   1081
   1082    return true;
   1083 }
   1084 
   1085 bool
   1086 NVC0LoweringPass::handleManualTXD(TexInstruction *i)
   1087 {
           // Emulates a texture fetch with explicit derivatives: for each of the 4
           // lanes of a quad, a clone of `i` (turned into a plain OP_TEX) is issued
           // with coordinates offset by dPdx/dPdy via quad operations, and the
           // per-lane results are merged with OP_UNION into the original
           // destinations. `i` is removed from its block. Always returns true.
           //
   1088    // Always done from the l0 perspective. This is the way that NVIDIA's
   1089    // driver does it, and doing it from the "current" lane's perspective
   1090    // doesn't seem to always work for reasons that aren't altogether clear,
   1091    // even in frag shaders.
   1092    //
   1093    // Note that we must move not only the coordinates into lane0, but also all
   1094    // ancillary arguments, like array indices and depth compare as they may
   1095    // differ between lanes. Offsets for TXD are supposed to be uniform, so we
   1096    // leave them alone.
   1097    static const uint8_t qOps[2] =
   1098       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) };
   1099
           // def[c][l]: result component c as computed for lane l.
   1100    Value *def[4][4];
   1101    Value *crd[3], *arr[2], *shadow;
   1102    Instruction *tex;
   1103    Value *zero = bld.loadImm(bld.getSSA(), 0);
   1104    int l, c;
   1105    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   1106
   1107    // This function is invoked after handleTEX lowering, so we have to expect
   1108    // the arguments in the order that the hw wants them. For Fermi, array and
   1109    // indirect are both in the leading arg, while for Kepler, array and
   1110    // indirect are separate (and both precede the coordinates). Maxwell is
   1111    // handled in a separate function.
           // `array` is the number of leading sources that precede the coordinates.
   1112    int array;
   1113    if (targ->getChipset() < NVISA_GK104_CHIPSET)
   1114       array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   1115    else
   1116       array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);
   1117
   1118    i->op = OP_TEX; // no need to clone dPdx/dPdy later
   1119
           // Scratch values reused by every lane iteration.
   1120    for (c = 0; c < dim; ++c)
   1121       crd[c] = bld.getScratch();
   1122    for (c = 0; c < array; ++c)
   1123       arr[c] = bld.getScratch();
   1124    shadow = bld.getScratch();
   1125
   1126    for (l = 0; l < 4; ++l) {
   1127       Value *src[3], *val;
   1128
   1129       bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   1130       // we're using the texture result from lane 0 in all cases, so make sure
   1131       // that lane 0 is pointing at the proper array index, indirect value,
   1132       // and depth compare.
   1133       if (l != 0) {
   1134          for (c = 0; c < array; ++c)
   1135             bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);
   1136          if (i->tex.target.isShadow()) {
   1137             // The next argument after coords is the depth compare
   1138             bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);
   1139          }
   1140       }
   1141       // mov position coordinates from lane l to all lanes
   1142       for (c = 0; c < dim; ++c)
   1143          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
   1144       // add dPdx from lane l to lanes dx
   1145       for (c = 0; c < dim; ++c)
   1146          bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);
   1147       // add dPdy from lane l to lanes dy
   1148       for (c = 0; c < dim; ++c)
   1149          bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);
   1150       // normalize cube coordinates
   1151       if (i->tex.target.isCube()) {
   1152          for (c = 0; c < 3; ++c)
   1153             src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
   1154          val = bld.getScratch();
   1155          bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
   1156          bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
   1157          bld.mkOp1(OP_RCP, TYPE_F32, val, val);
   1158          for (c = 0; c < 3; ++c)
   1159             src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
   1160       } else {
   1161          for (c = 0; c < dim; ++c)
   1162             src[c] = crd[c];
   1163       }
   1164       // texture
   1165       bld.insert(tex = cloneForward(func, i));
              // Lane 0 keeps the original leading/shadow sources; the other lanes
              // use the values gathered into arr[] / shadow above.
   1166       if (l != 0) {
   1167          for (c = 0; c < array; ++c)
   1168             tex->setSrc(c, arr[c]);
   1169          if (i->tex.target.isShadow())
   1170             tex->setSrc(array + dim, shadow);
   1171       }
   1172       for (c = 0; c < dim; ++c)
   1173          tex->setSrc(c + array, src[c]);
   1174       // broadcast results from lane 0 to all lanes so that the moves *into*
   1175       // the target lane pick up the proper value.
   1176       if (l != 0)
   1177          for (c = 0; i->defExists(c); ++c)
   1178             bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);
   1179       bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
   1180
   1181       // save results
   1182       for (c = 0; i->defExists(c); ++c) {
   1183          Instruction *mov;
   1184          def[c][l] = bld.getSSA();
   1185          mov = bld.mkMov(def[c][l], tex->getDef(c));
                 // Restrict the move to lane l via its lane mask and mark it fixed.
   1186          mov->fixed = 1;
   1187          mov->lanes = 1 << l;
   1188       }
   1189    }
   1190
           // Merge the four per-lane copies of each component into the original def.
   1191    for (c = 0; i->defExists(c); ++c) {
   1192       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
   1193       for (l = 0; l < 4; ++l)
   1194          u->setSrc(l, def[c][l]);
   1195    }
   1196
   1197    i->bb->remove(i);
   1198    return true;
   1199 }
   1200 
   1201 bool
   1202 NVC0LoweringPass::handleTXD(TexInstruction *txd)
   1203 {
           // Lowers a texture fetch with explicit derivatives. When the predicted
           // argument count exceeds 4, the target has more than 2 coordinates, or
           // it is a shadow target, the op is switched to OP_TEX and the work is
           // delegated to handleManualTXD. Otherwise the derivatives are appended
           // as ordinary sources after the handleTEX-ordered arguments.
   1204    int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
   1205    unsigned arg = txd->tex.target.getArgCount();
   1206    unsigned expected_args = arg;
   1207    const int chipset = prog->getTarget()->getChipset();
   1208
           // Predict how many sources handleTEX will leave in front of the
           // derivatives (extra slots for offsets and for an indirect handle).
   1209    if (chipset >= NVISA_GK104_CHIPSET) {
   1210       if (!txd->tex.target.isArray() && txd->tex.useOffsets)
   1211          expected_args++;
   1212       if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
   1213          expected_args++;
   1214    } else {
   1215       if (txd->tex.useOffsets)
   1216          expected_args++;
   1217       if (!txd->tex.target.isArray() && (
   1218                 txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
   1219          expected_args++;
   1220    }
   1221
   1222    if (expected_args > 4 ||
   1223        dim > 2 ||
   1224        txd->tex.target.isShadow())
   1225       txd->op = OP_TEX;
   1226
   1227    handleTEX(txd);
           // Advance arg to the first unused source slot at or after the target's
           // argument count.
   1228    while (txd->srcExists(arg))
   1229       ++arg;
   1230
   1231    txd->tex.derivAll = true;
   1232    if (txd->op == OP_TEX)
   1233       return handleManualTXD(txd);
   1234
   1235    assert(arg == expected_args);
           // Append dPdx/dPdy interleaved per coordinate, then clear the dedicated
           // derivative references.
   1236    for (int c = 0; c < dim; ++c) {
   1237       txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
   1238       txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
   1239       txd->dPdx[c].set(NULL);
   1240       txd->dPdy[c].set(NULL);
   1241    }
   1242
   1243    // In this case we have fewer than 4 "real" arguments, which means that
   1244    // handleTEX didn't apply any padding. However we have to make sure that
   1245    // the second "group" of arguments still gets padded up to 4.
   1246    if (chipset >= NVISA_GK104_CHIPSET) {
   1247       int s = arg + 2 * dim;
   1248       if (s >= 4 && s < 7) {
   1249          if (txd->srcExists(s)) // move potential predicate out of the way
   1250             txd->moveSources(s, 7 - s);
   1251          while (s < 7)
   1252             txd->setSrc(s++, bld.loadImm(NULL, 0));
   1253       }
   1254    }
   1255
   1256    return true;
   1257 }
   1258 
   1259 bool
   1260 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
   1261 {
           // Lowers a texture query. Direct bindings only get their index rebased
           // (GK104+); indirect bindings get the texture reference moved into
           // source 0 in the chipset-specific form. Always returns true.
   1262    const int chipset = prog->getTarget()->getChipset();
   1263    if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
   1264       txq->tex.r += prog->driver->io.texBindBase / 4;
   1265
           // Nothing more to do without an indirect texture reference.
   1266    if (txq->tex.rIndirectSrc < 0)
   1267       return true;
   1268
   1269    Value *ticRel = txq->getIndirectR();
   1270
           // The indirect sampler reference is dropped.
   1271    txq->setIndirectS(NULL);
   1272    txq->tex.sIndirectSrc = -1;
   1273
   1274    assert(ticRel);
   1275
   1276    if (chipset < NVISA_GK104_CHIPSET) {
   1277       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
   1278
              // Add the static index to the relative one (when non-zero), shift the
              // sum left by 0x17 bits and insert the result as new source 0.
   1279       txq->setSrc(txq->tex.rIndirectSrc, NULL);
   1280       if (txq->tex.r)
   1281          ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
   1282                              ticRel, bld.mkImm(txq->tex.r));
   1283
   1284       bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
   1285
   1286       txq->moveSources(0, 1);
   1287       txq->setSrc(0, src);
   1288    } else {
              // Load the handle from the aux constant buffer and make it source 0.
   1289       Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
   1290       txq->tex.r = 0xff;
   1291       txq->tex.s = 0x1f;
   1292
   1293       txq->setIndirectR(NULL);
   1294       txq->moveSources(0, 1);
   1295       txq->setSrc(0, hnd);
   1296       txq->tex.rIndirectSrc = 0;
   1297    }
   1298
   1299    return true;
   1300 }
   1301 
   1302 bool
   1303 NVC0LoweringPass::handleTXLQ(TexInstruction *i)
   1304 {
           // Lowers an LOD query: fixes up the component mask, applies the regular
           // handleTEX source lowering, then emits conversion code after `i` that
           // turns the raw results into floats. Always returns true.
   1305    /* The outputs are inverted compared to what the TGSI instruction
   1306     * expects. Take that into account in the mask.
   1307     */
   1308    assert((i->tex.mask & ~3) == 0);
   1309    if (i->tex.mask == 1)
   1310       i->tex.mask = 2;
   1311    else if (i->tex.mask == 2)
   1312       i->tex.mask = 1;
   1313    handleTEX(i);
   1314    bld.setPosition(i, true);
   1315
   1316    /* The returned values are not quite what we want:
   1317     * (a) convert from s16/u16 to f32
   1318     * (b) multiply by 1/256
   1319     */
   1320    for (int def = 0; def < 2; ++def) {
   1321       if (!i->defExists(def))
   1322          continue;
              // s16 is used only for def 0 when the mask is not 2; everything else
              // is read as u16.
   1323       enum DataType type = TYPE_S16;
   1324       if (i->tex.mask == 2 || def > 0)
   1325          type = TYPE_U16;
   1326       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
   1327       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
   1328                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   1329    }
           // With both components requested, swap def 0 and def 1 through a
           // temporary register.
   1330    if (i->tex.mask == 3) {
   1331       LValue *t = new_LValue(func, FILE_GPR);
   1332       bld.mkMov(t, i->getDef(0));
   1333       bld.mkMov(i->getDef(0), i->getDef(1));
   1334       bld.mkMov(i->getDef(1), t);
   1335    }
   1336    return true;
   1337 }
   1338 
   1339 bool
   1340 NVC0LoweringPass::handleBUFQ(Instruction *bufq)
   1341 {
           // Turns the buffer query into a MOV of the 32-bit buffer length obtained
           // from loadBufLength32(), keyed by the second indirect of source 0 and
           // the source's file index times 16. Both indirect references of source 0
           // are then cleared. Always returns true.
   1342    bufq->op = OP_MOV;
   1343    bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
   1344                                    bufq->getSrc(0)->reg.fileIndex * 16));
   1345    bufq->setIndirect(0, 0, NULL);
   1346    bufq->setIndirect(0, 1, NULL);
   1347    return true;
   1348 }
   1349 
   1350 void
   1351 NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
   1352 {
           // Replaces an atomic operation on shared memory with a locked-load /
           // unlocked-store retry loop spread over several basic blocks:
           //   currBB         -> tryLockBB
           //   tryLockBB      -> setAndUnlockBB (lock predicate set) or failLockBB
           //   setAndUnlockBB -> failLockBB
           //   failLockBB     -> tryLockBB (retry) or joinBB
           // `atom` is removed from its block; its operands are still read
           // afterwards to build the replacement code.
   1353    assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
   1354
   1355    BasicBlock *currBB = atom->bb;
   1356    BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   1357    BasicBlock *joinBB = atom->bb->splitAfter(atom);
   1358    BasicBlock *setAndUnlockBB = new BasicBlock(func);
   1359    BasicBlock *failLockBB = new BasicBlock(func);
   1360
   1361    bld.setPosition(currBB, true);
   1362    assert(!currBB->joinAt);
   1363    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
   1364
           // pred is initialized with the result of comparing the immediates 0 and
           // 1 for equality; the unlocked store below also defines it.
   1365    CmpInstruction *pred =
   1366       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
   1367                 TYPE_U32, bld.mkImm(0), bld.mkImm(1));
   1368
   1369    bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   1370    currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
   1371
   1372    bld.setPosition(tryLockBB, true);
   1373
           // Locked load of the current value into the atom's destination; the
           // second def is a predicate used to choose the branch below.
   1374    Instruction *ld =
   1375       bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
   1376                  atom->getIndirect(0, 0));
   1377    ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   1378    ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
   1379
   1380    bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
   1381    bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   1382    tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   1383    tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
   1384
   1385    tryLockBB->cfg.detach(&joinBB->cfg);
   1386    bld.remove(atom);
   1387
   1388    bld.setPosition(setAndUnlockBB, true);
           // Compute the value to store back according to the atomic sub-operation.
   1389    Value *stVal;
   1390    if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
   1391       // Read the old value, and write the new one.
   1392       stVal = atom->getSrc(1);
   1393    } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
              // Compare the loaded value with source 1, then select between source
              // 2 and the loaded value based on the comparison result.
   1394       CmpInstruction *set =
   1395          bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
   1396                    TYPE_U32, ld->getDef(0), atom->getSrc(1));
   1397
   1398       bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
   1399                 TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
   1400    } else {
   1401       operation op;
   1402
              // Map the remaining atomic sub-operations to the equivalent ALU op.
   1403       switch (atom->subOp) {
   1404       case NV50_IR_SUBOP_ATOM_ADD:
   1405          op = OP_ADD;
   1406          break;
   1407       case NV50_IR_SUBOP_ATOM_AND:
   1408          op = OP_AND;
   1409          break;
   1410       case NV50_IR_SUBOP_ATOM_OR:
   1411          op = OP_OR;
   1412          break;
   1413       case NV50_IR_SUBOP_ATOM_XOR:
   1414          op = OP_XOR;
   1415          break;
   1416       case NV50_IR_SUBOP_ATOM_MIN:
   1417          op = OP_MIN;
   1418          break;
   1419       case NV50_IR_SUBOP_ATOM_MAX:
   1420          op = OP_MAX;
   1421          break;
   1422       default:
   1423          assert(0);
   1424          return;
   1425       }
   1426
   1427       stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
   1428                          atom->getSrc(1));
   1429    }
   1430
           // Unlocked store of the new value; it also defines pred.
   1431    Instruction *st =
   1432       bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
   1433                   atom->getIndirect(0, 0), stVal);
   1434    st->setDef(0, pred->getDef(0));
   1435    st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
   1436
   1437    bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   1438    setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
   1439
   1440    // Branch back and retry while pred is not set; otherwise go to joinBB.
   1441    bld.setPosition(failLockBB, true);
   1442    bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
   1443    bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   1444    failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   1445    failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
   1446
   1447    bld.setPosition(joinBB, false);
   1448    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
   1449 }
   1450 
   1451 void
   1452 NVC0LoweringPass::handleSharedATOM(Instruction *atom)
   1453 {
           // Replaces an atomic operation on shared memory with a single-block
           // retry loop (tryLockAndSetBB): a locked load, the ALU operation and an
           // unlocked store, all predicated on the predicate produced by the locked
           // load. The block branches back to itself while that predicate is not
           // set and otherwise continues to joinBB. `atom` is removed from its
           // block.
   1454    assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
   1455
   1456    BasicBlock *currBB = atom->bb;
   1457    BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
   1458    BasicBlock *joinBB = atom->bb->splitAfter(atom);
   1459
   1460    bld.setPosition(currBB, true);
   1461    assert(!currBB->joinAt);
   1462    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
   1463
   1464    bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
   1465    currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);
   1466
   1467    bld.setPosition(tryLockAndSetBB, true);
   1468
           // Locked load of the current value into the atom's destination; def 1
           // is the predicate that guards everything below.
   1469    Instruction *ld =
   1470       bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
   1471                  atom->getIndirect(0, 0));
   1472    ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   1473    ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
   1474
           // Compute the value to store back according to the atomic sub-operation.
   1475    Value *stVal;
   1476    if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
   1477       // Read the old value, and write the new one.
   1478       stVal = atom->getSrc(1);
   1479    } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
              // Compare the loaded value with source 1 into a predicate, then use
              // SELP (with its third source negated) to choose between the loaded
              // value and source 2.
   1480       CmpInstruction *set =
   1481          bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
   1482                    TYPE_U32, ld->getDef(0), atom->getSrc(1));
   1483       set->setPredicate(CC_P, ld->getDef(1));
   1484
   1485       Instruction *selp =
   1486          bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
   1487                    atom->getSrc(2), set->getDef(0));
   1488       selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
   1489       selp->setPredicate(CC_P, ld->getDef(1));
   1490
   1491       stVal = selp->getDef(0);
   1492    } else {
   1493       operation op;
   1494
              // Map the remaining atomic sub-operations to the equivalent ALU op.
   1495       switch (atom->subOp) {
   1496       case NV50_IR_SUBOP_ATOM_ADD:
   1497          op = OP_ADD;
   1498          break;
   1499       case NV50_IR_SUBOP_ATOM_AND:
   1500          op = OP_AND;
   1501          break;
   1502       case NV50_IR_SUBOP_ATOM_OR:
   1503          op = OP_OR;
   1504          break;
   1505       case NV50_IR_SUBOP_ATOM_XOR:
   1506          op = OP_XOR;
   1507          break;
   1508       case NV50_IR_SUBOP_ATOM_MIN:
   1509          op = OP_MIN;
   1510          break;
   1511       case NV50_IR_SUBOP_ATOM_MAX:
   1512          op = OP_MAX;
   1513          break;
   1514       default:
   1515          assert(0);
   1516          return;
   1517       }
   1518
   1519       Instruction *i =
   1520          bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
   1521                    atom->getSrc(1));
   1522       i->setPredicate(CC_P, ld->getDef(1));
   1523
   1524       stVal = i->getDef(0);
   1525    }
   1526
           // Unlocked store of the new value, guarded by the load's predicate.
   1527    Instruction *st =
   1528       bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
   1529                   atom->getIndirect(0, 0), stVal);
   1530    st->setPredicate(CC_P, ld->getDef(1));
   1531    st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
   1532
   1533    // Loop until the lock is acquired.
   1534    bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
   1535    tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
   1536    tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
   1537    bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   1538
   1539    bld.remove(atom);
   1540
   1541    bld.setPosition(joinBB, false);
   1542    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
   1543 }
   1544 
   1545 bool
   1546 NVC0LoweringPass::handleATOM(Instruction *atom)
   1547 {
   1548    SVSemantic sv;
   1549    Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;
   1550 
   1551    switch (atom->src(0).getFile()) {
   1552    case FILE_MEMORY_LOCAL:
   1553       sv = SV_LBASE;
   1554       break;
   1555    case FILE_MEMORY_SHARED:
   1556       // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
   1557       // operations on shared memory. For Maxwell, ATOMS is enough.
   1558       if (targ->getChipset() < NVISA_GK104_CHIPSET)
   1559          handleSharedATOM(atom);
   1560       else if (targ->getChipset() < NVISA_GM107_CHIPSET)
   1561          handleSharedATOMNVE4(atom);
   1562       return true;
   1563    default:
   1564       assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
   1565       base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
   1566       assert(base->reg.size == 8);
   1567       if (ptr)
   1568          base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
   1569       assert(base->reg.size == 8);
   1570       atom->setIndirect(0, 0, base);
   1571       atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   1572 
   1573       // Harden against out-of-bounds accesses
   1574       Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
   1575       Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
   1576       Value *pred = new_LValue(func, FILE_PREDICATE);
   1577       if (ptr)
   1578          bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
   1579       bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
   1580       atom->setPredicate(CC_NOT_P, pred);
   1581       if (atom->defExists(0)) {
   1582          Value *zero, *dst = atom->getDef(0);
   1583          atom->setDef(0, bld.getSSA());
   1584 
   1585          bld.setPosition(atom, true);
   1586          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
   1587             ->setPredicate(CC_P, pred);
   1588          bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
   1589       }
   1590 
   1591       return true;
   1592    }
   1593    base =
   1594       bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
   1595 
   1596    atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   1597    atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   1598    if (ptr)
   1599       base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   1600    atom->setIndirect(0, 1, NULL);
   1601    atom->setIndirect(0, 0, base);
   1602 
   1603    return true;
   1604 }
   1605 
   1606 bool
   1607 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
   1608 {
   1609    if (targ->getChipset() < NVISA_GM107_CHIPSET) {
   1610       if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
   1611          // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
   1612          return false;
   1613       }
   1614    }
   1615 
   1616    if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
   1617        cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
   1618       return false;
   1619    bld.setPosition(cas, true);
   1620 
   1621    if (needCctl) {
   1622       Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
   1623       cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
   1624       cctl->fixed = 1;
   1625       cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
   1626       if (cas->isPredicated())
   1627          cctl->setPredicate(cas->cc, cas->getPredicate());
   1628    }
   1629 
   1630    if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
   1631       // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
   1632       // should be set to the high part of the double reg or bad things will
   1633       // happen elsewhere in the universe.
   1634       // Also, it sometimes returns the new value instead of the old one
   1635       // under mysterious circumstances.
   1636       Value *dreg = bld.getSSA(8);
   1637       bld.setPosition(cas, false);
   1638       bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
   1639       cas->setSrc(1, dreg);
   1640       cas->setSrc(2, dreg);
   1641    }
   1642 
   1643    return true;
   1644 }
   1645 
   1646 inline Value *
   1647 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
   1648 {
   1649    uint8_t b = prog->driver->io.auxCBSlot;
   1650    off += base;
   1651 
   1652    return bld.
   1653       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
   1654 }
   1655 
   1656 inline Value *
   1657 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
   1658 {
   1659    uint8_t b = prog->driver->io.auxCBSlot;
   1660    off += base;
   1661 
   1662    if (ptr)
   1663       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
   1664 
   1665    return bld.
   1666       mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
   1667 }
   1668 
   1669 inline Value *
   1670 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
   1671 {
   1672    uint8_t b = prog->driver->io.auxCBSlot;
   1673    off += base;
   1674 
   1675    if (ptr)
   1676       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
   1677 
   1678    return bld.
   1679       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
   1680 }
   1681 
   1682 inline Value *
   1683 NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
   1684 {
   1685    return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
   1686 }
   1687 
   1688 inline Value *
   1689 NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
   1690 {
   1691    return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
   1692 }
   1693 
   1694 inline Value *
   1695 NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
   1696 {
   1697    return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
   1698 }
   1699 
   1700 inline Value *
   1701 NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
   1702 {
   1703    return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
   1704 }
   1705 
   1706 inline Value *
   1707 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
   1708 {
   1709    uint8_t b = prog->driver->io.msInfoCBSlot;
   1710    off += prog->driver->io.msInfoBase;
   1711    return bld.
   1712       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
   1713 }
   1714 
   1715 /* On nvc0, surface info is obtained via the surface binding points passed
   1716  * to the SULD/SUST instructions.
   1717  * On nve4, surface info is stored in c[] and is used by various special
   1718  * instructions, e.g. for clamping coordinates or generating an address.
   1719  * They couldn't just have added an equivalent to TIC now, couldn't they ?
   1720  */
   1721 #define NVC0_SU_INFO_ADDR   0x00
   1722 #define NVC0_SU_INFO_FMT    0x04
   1723 #define NVC0_SU_INFO_DIM_X  0x08
   1724 #define NVC0_SU_INFO_PITCH  0x0c
   1725 #define NVC0_SU_INFO_DIM_Y  0x10
   1726 #define NVC0_SU_INFO_ARRAY  0x14
   1727 #define NVC0_SU_INFO_DIM_Z  0x18
   1728 #define NVC0_SU_INFO_UNK1C  0x1c
   1729 #define NVC0_SU_INFO_WIDTH  0x20
   1730 #define NVC0_SU_INFO_HEIGHT 0x24
   1731 #define NVC0_SU_INFO_DEPTH  0x28
   1732 #define NVC0_SU_INFO_TARGET 0x2c
   1733 #define NVC0_SU_INFO_BSIZE  0x30
   1734 #define NVC0_SU_INFO_RAW_X  0x34
   1735 #define NVC0_SU_INFO_MS_X   0x38
   1736 #define NVC0_SU_INFO_MS_Y   0x3c
   1737 
   1738 #define NVC0_SU_INFO__STRIDE 0x40
   1739 
   1740 #define NVC0_SU_INFO_DIM(i)  (0x08 + (i) * 8)
   1741 #define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
   1742 #define NVC0_SU_INFO_MS(i)   (0x38 + (i) * 4)
   1743 
   1744 inline Value *
   1745 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
   1746 {
   1747    uint32_t base = slot * NVC0_SU_INFO__STRIDE;
   1748 
   1749    if (ptr) {
   1750       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
   1751       if (bindless)
   1752          ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
   1753       else
   1754          ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
   1755       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
   1756       base = 0;
   1757    }
   1758    off += base;
   1759 
   1760    return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
   1761                         prog->driver->io.suInfoBase);
   1762 }
   1763 
   1764 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
   1765 {
   1766    switch (su->tex.target.getEnum()) {
   1767    case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   1768    case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1769    case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1770    case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
   1771                                    NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
   1772                                    NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1773    case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   1774    case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   1775    case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1776    case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1777    case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1778    case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1779    case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   1780    default:
   1781       assert(0);
   1782       return 0;
   1783    }
   1784 }
   1785 
   1786 bool
   1787 NVC0LoweringPass::handleSUQ(TexInstruction *suq)
   1788 {
   1789    int mask = suq->tex.mask;
   1790    int dim = suq->tex.target.getDim();
   1791    int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   1792    Value *ind = suq->getIndirectR();
   1793    int slot = suq->tex.r;
   1794    int c, d;
   1795 
   1796    for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
   1797       if (c >= arg || !(mask & 1))
   1798          continue;
   1799 
   1800       int offset;
   1801 
   1802       if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
   1803          offset = NVC0_SU_INFO_SIZE(2);
   1804       } else {
   1805          offset = NVC0_SU_INFO_SIZE(c);
   1806       }
   1807       bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));
   1808       if (c == 2 && suq->tex.target.isCube())
   1809          bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
   1810                    bld.loadImm(NULL, 6));
   1811    }
   1812 
   1813    if (mask & 1) {
   1814       if (suq->tex.target.isMS()) {
   1815          Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);
   1816          Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);
   1817          Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
   1818          bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
   1819       } else {
   1820          bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
   1821       }
   1822    }
   1823 
   1824    bld.remove(suq);
   1825    return true;
   1826 }
   1827 
   1828 void
   1829 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
   1830 {
   1831    const int arg = tex->tex.target.getArgCount();
   1832    int slot = tex->tex.r;
   1833 
   1834    if (tex->tex.target == TEX_TARGET_2D_MS)
   1835       tex->tex.target = TEX_TARGET_2D;
   1836    else
   1837    if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
   1838       tex->tex.target = TEX_TARGET_2D_ARRAY;
   1839    else
   1840       return;
   1841 
   1842    Value *x = tex->getSrc(0);
   1843    Value *y = tex->getSrc(1);
   1844    Value *s = tex->getSrc(arg - 1);
   1845 
   1846    Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
   1847    Value *ind = tex->getIndirectR();
   1848 
   1849    Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), tex->tex.bindless);
   1850    Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), tex->tex.bindless);
   1851 
   1852    bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   1853    bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
   1854 
   1855    s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   1856    s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
   1857 
   1858    Value *dx = loadMsInfo32(ts, 0x0);
   1859    Value *dy = loadMsInfo32(ts, 0x4);
   1860 
   1861    bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   1862    bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
   1863 
   1864    tex->setSrc(0, tx);
   1865    tex->setSrc(1, ty);
   1866    tex->moveSources(arg, -1);
   1867 }
   1868 
   1869 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
   1870 // They're computed from the coordinates using the surface info in c[] space.
   1871 void
   1872 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
   1873 {
   1874    Instruction *insn;
   1875    const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   1876    const bool raw =
   1877       su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   1878    const int slot = su->tex.r;
   1879    const int dim = su->tex.target.getDim();
   1880    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   1881    int c;
   1882    Value *zero = bld.mkImm(0);
   1883    Value *p1 = NULL;
   1884    Value *v;
   1885    Value *src[3];
   1886    Value *bf, *eau, *off;
   1887    Value *addr, *pred;
   1888    Value *ind = su->getIndirectR();
   1889 
   1890    off = bld.getScratch(4);
   1891    bf = bld.getScratch(4);
   1892    addr = bld.getSSA(8);
   1893    pred = bld.getScratch(1, FILE_PREDICATE);
   1894 
   1895    bld.setPosition(su, false);
   1896 
   1897    adjustCoordinatesMS(su);
   1898 
   1899    // calculate clamped coordinates
   1900    for (c = 0; c < arg; ++c) {
   1901       int dimc = c;
   1902 
   1903       if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
   1904          // The array index is stored in the Z component for 1D arrays.
   1905          dimc = 2;
   1906       }
   1907 
   1908       src[c] = bld.getScratch();
   1909       if (c == 0 && raw)
   1910          v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
   1911       else
   1912          v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
   1913       bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
   1914          ->subOp = getSuClampSubOp(su, dimc);
   1915    }
   1916    for (; c < 3; ++c)
   1917       src[c] = zero;
   1918 
   1919    // set predicate output
   1920    if (su->tex.target == TEX_TARGET_BUFFER) {
   1921       src[0]->getInsn()->setFlagsDef(1, pred);
   1922    } else
   1923    if (su->tex.target.isArray() || su->tex.target.isCube()) {
   1924       p1 = bld.getSSA(1, FILE_PREDICATE);
   1925       src[dim]->getInsn()->setFlagsDef(1, p1);
   1926    }
   1927 
   1928    // calculate pixel offset
   1929    if (dim == 1) {
   1930       if (su->tex.target != TEX_TARGET_BUFFER)
   1931          bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   1932    } else
   1933    if (dim == 3) {
   1934       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
   1935       bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
   1936          ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   1937 
   1938       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
   1939       bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
   1940          ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   1941    } else {
   1942       assert(dim == 2);
   1943       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
   1944       bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
   1945          ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
   1946          NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   1947    }
   1948 
   1949    // calculate effective address part 1
   1950    if (su->tex.target == TEX_TARGET_BUFFER) {
   1951       if (raw) {
   1952          bf = src[0];
   1953       } else {
   1954          v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
   1955          bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
   1956             ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
   1957       }
   1958    } else {
   1959       Value *y = src[1];
   1960       Value *z = src[2];
   1961       uint16_t subOp = 0;
   1962 
   1963       switch (dim) {
   1964       case 1:
   1965          y = zero;
   1966          z = zero;
   1967          break;
   1968       case 2:
   1969          z = off;
   1970          if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
   1971             z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
   1972             subOp = NV50_IR_SUBOP_SUBFM_3D;
   1973          }
   1974          break;
   1975       default:
   1976          subOp = NV50_IR_SUBOP_SUBFM_3D;
   1977          assert(dim == 3);
   1978          break;
   1979       }
   1980       insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
   1981       insn->subOp = subOp;
   1982       insn->setFlagsDef(1, pred);
   1983    }
   1984 
   1985    // part 2
   1986    v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);
   1987 
   1988    if (su->tex.target == TEX_TARGET_BUFFER) {
   1989       eau = v;
   1990    } else {
   1991       eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   1992    }
   1993    // add array layer offset
   1994    if (su->tex.target.isArray() || su->tex.target.isCube()) {
   1995       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
   1996       if (dim == 1)
   1997          bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
   1998             ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
   1999       else
   2000          bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
   2001             ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
   2002       // combine predicates
   2003       assert(p1);
   2004       bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   2005    }
   2006 
   2007    if (atom) {
   2008       Value *lo = bf;
   2009       if (su->tex.target == TEX_TARGET_BUFFER) {
   2010          lo = zero;
   2011          bld.mkMov(off, bf);
   2012       }
   2013       //  bf == g[] address & 0xff
   2014       // eau == g[] address >> 8
   2015       bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
   2016       bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   2017    } else
   2018    if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
   2019       // Convert from u32 to u8 address format, which is what the library code
   2020       // doing SULDP currently uses.
   2021       // XXX: can SUEAU do this ?
   2022       // XXX: does it matter that we don't mask high bytes in bf ?
   2023       // Grrr.
   2024       bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
   2025       bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   2026    }
   2027 
   2028    bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
   2029 
   2030    if (atom && su->tex.target == TEX_TARGET_BUFFER)
   2031       bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
   2032 
   2033    // let's just set it 0 for raw access and hope it works
   2034    v = raw ?
   2035       bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
   2036 
   2037    // get rid of old coordinate sources, make space for fmt info and predicate
   2038    su->moveSources(arg, 3 - arg);
   2039    // set 64 bit address and 32-bit format sources
   2040    su->setSrc(0, addr);
   2041    su->setSrc(1, v);
   2042    su->setSrc(2, pred);
   2043    su->setIndirectR(NULL);
   2044 
   2045    // prevent read fault when the image is not actually bound
   2046    CmpInstruction *pred1 =
   2047       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
   2048                 TYPE_U32, bld.mkImm(0),
   2049                 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
   2050 
   2051    if (su->op != OP_SUSTP && su->tex.format) {
   2052       const TexInstruction::ImgFormatDesc *format = su->tex.format;
   2053       int blockwidth = format->bits[0] + format->bits[1] +
   2054                        format->bits[2] + format->bits[3];
   2055 
   2056       // make sure that the format doesn't mismatch
   2057       assert(format->components != 0);
   2058       bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
   2059                 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
   2060                 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
   2061                 pred1->getDef(0));
   2062    }
   2063    su->setPredicate(CC_NOT_P, pred1->getDef(0));
   2064 
   2065    // TODO: initialize def values to 0 when the surface operation is not
   2066    // performed (not needed for stores). Also, fix the "address bounds test"
   2067    // subtests from arb_shader_image_load_store-invalid for buffers, because it
   2068    // seems like that the predicate is not correctly set by suclamp.
   2069 }
   2070 
   2071 static DataType
   2072 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
   2073 {
   2074    switch (t->type) {
   2075    case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
   2076    case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
   2077    case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
   2078    case UINT:
   2079       return (t->bits[c] == 8 ? TYPE_U8 :
   2080               (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
   2081    case SINT:
   2082       return (t->bits[c] == 8 ? TYPE_S8 :
   2083               (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
   2084    }
   2085    return TYPE_NONE;
   2086 }
   2087 
   2088 static DataType
   2089 getDestType(const ImgType type) {
   2090    switch (type) {
   2091    case FLOAT:
   2092    case UNORM:
   2093    case SNORM:
   2094       return TYPE_F32;
   2095    case UINT:
   2096       return TYPE_U32;
   2097    case SINT:
   2098       return TYPE_S32;
   2099    default:
   2100       assert(!"Impossible type");
   2101       return TYPE_NONE;
   2102    }
   2103 }
   2104 
   2105 void
   2106 NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
   2107 {
   2108    const TexInstruction::ImgFormatDesc *format = su->tex.format;
   2109    int width = format->bits[0] + format->bits[1] +
   2110       format->bits[2] + format->bits[3];
   2111    Value *untypedDst[4] = {};
   2112    Value *typedDst[4] = {};
   2113 
   2114    // We must convert this to a generic load.
   2115    su->op = OP_SULDB;
   2116 
   2117    su->dType = typeOfSize(width / 8);
   2118    su->sType = TYPE_U8;
   2119 
   2120    for (int i = 0; i < width / 32; i++)
   2121       untypedDst[i] = bld.getSSA();
   2122    if (width < 32)
   2123       untypedDst[0] = bld.getSSA();
   2124 
   2125    for (int i = 0; i < 4; i++) {
   2126       typedDst[i] = su->getDef(i);
   2127    }
   2128 
   2129    // Set the untyped dsts as the su's destinations
   2130    for (int i = 0; i < 4; i++)
   2131       su->setDef(i, untypedDst[i]);
   2132 
   2133    bld.setPosition(su, true);
   2134 
   2135    // Unpack each component into the typed dsts
   2136    int bits = 0;
   2137    for (int i = 0; i < 4; bits += format->bits[i], i++) {
   2138       if (!typedDst[i])
   2139          continue;
   2140       if (i >= format->components) {
   2141          if (format->type == FLOAT ||
   2142              format->type == UNORM ||
   2143              format->type == SNORM)
   2144             bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
   2145          else
   2146             bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
   2147          continue;
   2148       }
   2149 
   2150       // Get just that component's data into the relevant place
   2151       if (format->bits[i] == 32)
   2152          bld.mkMov(typedDst[i], untypedDst[i]);
   2153       else if (format->bits[i] == 16)
   2154          bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
   2155                    getSrcType(format, i), untypedDst[i / 2])
   2156          ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
   2157       else if (format->bits[i] == 8)
   2158          bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
   2159                    getSrcType(format, i), untypedDst[0])->subOp = i;
   2160       else {
   2161          bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
   2162                    bld.mkImm((bits % 32) | (format->bits[i] << 8)));
   2163          if (format->type == UNORM || format->type == SNORM)
   2164             bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
   2165       }
   2166 
   2167       // Normalize / convert as necessary
   2168       if (format->type == UNORM)
   2169          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
   2170       else if (format->type == SNORM)
   2171          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
   2172       else if (format->type == FLOAT && format->bits[i] < 16) {
   2173          bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
   2174          bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
   2175       }
   2176    }
   2177 
   2178    if (format->bgra) {
   2179       std::swap(typedDst[0], typedDst[2]);
   2180    }
   2181 }
   2182 
   2183 void
   2184 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
   2185 {
   2186    processSurfaceCoordsNVE4(su);
   2187 
   2188    if (su->op == OP_SULDP)
   2189       convertSurfaceFormat(su);
   2190 
   2191    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
   2192       assert(su->getPredicate());
   2193       Value *pred =
   2194          bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
   2195                     su->getPredicate(), su->getSrc(2));
   2196 
   2197       Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
   2198       red->subOp = su->subOp;
   2199       red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
   2200       red->setSrc(1, su->getSrc(3));
   2201       if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
   2202          red->setSrc(2, su->getSrc(4));
   2203       red->setIndirect(0, 0, su->getSrc(0));
   2204 
   2205       // make sure to initialize dst value when the atomic operation is not
   2206       // performed
   2207       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
   2208 
   2209       assert(su->cc == CC_NOT_P);
   2210       red->setPredicate(su->cc, pred);
   2211       mov->setPredicate(CC_P, pred);
   2212 
   2213       bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
   2214                 red->getDef(0), mov->getDef(0));
   2215 
   2216       delete_Instruction(bld.getProgram(), su);
   2217       handleCasExch(red, true);
   2218    }
   2219 
   2220    if (su->op == OP_SUSTB || su->op == OP_SUSTP)
   2221       su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
   2222 }
   2223 
   2224 void
   2225 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
   2226 {
   2227    const int slot = su->tex.r;
   2228    const int dim = su->tex.target.getDim();
   2229    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   2230    int c;
   2231    Value *zero = bld.mkImm(0);
   2232    Value *src[3];
   2233    Value *v;
   2234    Value *ind = su->getIndirectR();
   2235 
   2236    bld.setPosition(su, false);
   2237 
   2238    adjustCoordinatesMS(su);
   2239 
   2240    if (ind) {
   2241       Value *ptr;
   2242       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
   2243       ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
   2244       su->setIndirectR(ptr);
   2245    }
   2246 
   2247    // get surface coordinates
   2248    for (c = 0; c < arg; ++c)
   2249       src[c] = su->getSrc(c);
   2250    for (; c < 3; ++c)
   2251       src[c] = zero;
   2252 
   2253    // calculate pixel offset
   2254    if (su->op == OP_SULDP || su->op == OP_SUREDP) {
   2255       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);
   2256       su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
   2257    }
   2258 
   2259    // add array layer offset
   2260    if (su->tex.target.isArray() || su->tex.target.isCube()) {
   2261       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
   2262       assert(dim > 1);
   2263       su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
   2264    }
   2265 
   2266    // prevent read fault when the image is not actually bound
   2267    CmpInstruction *pred =
   2268       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
   2269                 TYPE_U32, bld.mkImm(0),
   2270                 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
   2271    if (su->op != OP_SUSTP && su->tex.format) {
   2272       const TexInstruction::ImgFormatDesc *format = su->tex.format;
   2273       int blockwidth = format->bits[0] + format->bits[1] +
   2274                        format->bits[2] + format->bits[3];
   2275 
   2276       assert(format->components != 0);
   2277       // make sure that the format doesn't mismatch when it's not FMT_NONE
   2278       bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
   2279                 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
   2280                 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
   2281                 pred->getDef(0));
   2282    }
   2283    su->setPredicate(CC_NOT_P, pred->getDef(0));
   2284 }
   2285 
   2286 void
   2287 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
   2288 {
   2289    if (su->tex.target == TEX_TARGET_1D_ARRAY) {
   2290       /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
   2291        * will simplify the lowering pass and the texture constraints. */
   2292       su->moveSources(1, 1);
   2293       su->setSrc(1, bld.loadImm(NULL, 0));
   2294       su->tex.target = TEX_TARGET_2D_ARRAY;
   2295    }
   2296 
   2297    processSurfaceCoordsNVC0(su);
   2298 
   2299    if (su->op == OP_SULDP)
   2300       convertSurfaceFormat(su);
   2301 
   2302    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
   2303       const int dim = su->tex.target.getDim();
   2304       const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   2305       LValue *addr = bld.getSSA(8);
   2306       Value *def = su->getDef(0);
   2307 
   2308       su->op = OP_SULEA;
   2309 
   2310       // Set the destination to the address
   2311       su->dType = TYPE_U64;
   2312       su->setDef(0, addr);
   2313       su->setDef(1, su->getPredicate());
   2314 
   2315       bld.setPosition(su, true);
   2316 
   2317       // Perform the atomic op
   2318       Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
   2319       red->subOp = su->subOp;
   2320       red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
   2321       red->setSrc(1, su->getSrc(arg));
   2322       if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
   2323          red->setSrc(2, su->getSrc(arg + 1));
   2324       red->setIndirect(0, 0, addr);
   2325 
   2326       // make sure to initialize dst value when the atomic operation is not
   2327       // performed
   2328       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
   2329 
   2330       assert(su->cc == CC_NOT_P);
   2331       red->setPredicate(su->cc, su->getPredicate());
   2332       mov->setPredicate(CC_P, su->getPredicate());
   2333 
   2334       bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
   2335 
   2336       handleCasExch(red, false);
   2337    }
   2338 }
   2339 
   2340 void
   2341 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
   2342 {
   2343    const int slot = su->tex.r;
   2344    const int dim = su->tex.target.getDim();
   2345    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   2346    Value *ind = su->getIndirectR();
   2347    int pos = 0;
   2348 
   2349    bld.setPosition(su, false);
   2350 
   2351    // add texture handle
   2352    switch (su->op) {
   2353    case OP_SUSTP:
   2354       pos = 4;
   2355       break;
   2356    case OP_SUREDP:
   2357       pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
   2358       break;
   2359    default:
   2360       assert(pos == 0);
   2361       break;
   2362    }
   2363    su->setSrc(arg + pos, loadTexHandle(ind, slot + 32));
   2364 
   2365    // prevent read fault when the image is not actually bound
   2366    CmpInstruction *pred =
   2367       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
   2368                 TYPE_U32, bld.mkImm(0),
   2369                 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
   2370    if (su->op != OP_SUSTP && su->tex.format) {
   2371       const TexInstruction::ImgFormatDesc *format = su->tex.format;
   2372       int blockwidth = format->bits[0] + format->bits[1] +
   2373                        format->bits[2] + format->bits[3];
   2374 
   2375       assert(format->components != 0);
   2376       // make sure that the format doesn't mismatch when it's not FMT_NONE
   2377       bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
   2378                 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
   2379                 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
   2380                 pred->getDef(0));
   2381    }
   2382    su->setPredicate(CC_NOT_P, pred->getDef(0));
   2383 }
   2384 
   2385 void
   2386 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
   2387 {
   2388    processSurfaceCoordsGM107(su);
   2389 
   2390    if (su->op == OP_SULDP)
   2391       convertSurfaceFormat(su);
   2392 
   2393    if (su->op == OP_SUREDP) {
   2394       Value *def = su->getDef(0);
   2395 
   2396       su->op = OP_SUREDB;
   2397       su->setDef(0, bld.getSSA());
   2398 
   2399       bld.setPosition(su, true);
   2400 
   2401       // make sure to initialize dst value when the atomic operation is not
   2402       // performed
   2403       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
   2404 
   2405       assert(su->cc == CC_NOT_P);
   2406       mov->setPredicate(CC_P, su->getPredicate());
   2407 
   2408       bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
   2409    }
   2410 }
   2411 
   2412 bool
   2413 NVC0LoweringPass::handleWRSV(Instruction *i)
   2414 {
   2415    Instruction *st;
   2416    Symbol *sym;
   2417    uint32_t addr;
   2418 
   2419    // must replace, $sreg are not writeable
   2420    addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
   2421    if (addr >= 0x400)
   2422       return false;
   2423    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
   2424 
   2425    st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
   2426                     i->getSrc(1));
   2427    st->perPatch = i->perPatch;
   2428 
   2429    bld.getBB()->remove(i);
   2430    return true;
   2431 }
   2432 
   2433 void
   2434 NVC0LoweringPass::handleLDST(Instruction *i)
   2435 {
   2436    if (i->src(0).getFile() == FILE_SHADER_INPUT) {
   2437       if (prog->getType() == Program::TYPE_COMPUTE) {
   2438          i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
   2439          i->getSrc(0)->reg.fileIndex = 0;
   2440       } else
   2441       if (prog->getType() == Program::TYPE_GEOMETRY &&
   2442           i->src(0).isIndirect(0)) {
   2443          // XXX: this assumes vec4 units
   2444          Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
   2445                                  i->getIndirect(0, 0), bld.mkImm(4));
   2446          i->setIndirect(0, 0, ptr);
   2447          i->op = OP_VFETCH;
   2448       } else {
   2449          i->op = OP_VFETCH;
   2450          assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
   2451       }
   2452    } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
   2453       if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
   2454           prog->getType() == Program::TYPE_COMPUTE) {
   2455          // The launch descriptor only allows to set up 8 CBs, but OpenGL
   2456          // requires at least 12 UBOs. To bypass this limitation, we store the
   2457          // addrs into the driver constbuf and we directly load from the global
   2458          // memory.
   2459          int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
   2460          Value *ind = i->getIndirect(0, 1);
   2461 
   2462          if (!ind && fileIndex == -1)
   2463             return;
   2464 
   2465          if (ind) {
   2466             // Clamp the UBO index when an indirect access is used to avoid
   2467             // loading information from the wrong place in the driver cb.
   2468             // TODO - synchronize the max with the driver.
   2469             ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),
   2470                              bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
   2471                                         ind, bld.loadImm(NULL, fileIndex)),
   2472                              bld.loadImm(NULL, 13));
   2473             fileIndex = 0;
   2474          }
   2475 
   2476          Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
   2477          Value *ptr = loadUboInfo64(ind, fileIndex * 16);
   2478          Value *length = loadUboLength32(ind, fileIndex * 16);
   2479          Value *pred = new_LValue(func, FILE_PREDICATE);
   2480          if (i->src(0).isIndirect(0)) {
   2481             bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
   2482             bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
   2483          }
   2484          i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   2485          i->setIndirect(0, 1, NULL);
   2486          i->setIndirect(0, 0, ptr);
   2487          bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
   2488          i->setPredicate(CC_NOT_P, pred);
   2489          Value *zero, *dst = i->getDef(0);
   2490          i->setDef(0, bld.getSSA());
   2491 
   2492          bld.setPosition(i, true);
   2493          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
   2494             ->setPredicate(CC_P, pred);
   2495          bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
   2496       } else if (i->src(0).isIndirect(1)) {
   2497          Value *ptr;
   2498          if (i->src(0).isIndirect(0))
   2499             ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
   2500                              i->getIndirect(0, 1), bld.mkImm(0x1010),
   2501                              i->getIndirect(0, 0));
   2502          else
   2503             ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
   2504                              i->getIndirect(0, 1), bld.mkImm(16));
   2505          i->setIndirect(0, 1, NULL);
   2506          i->setIndirect(0, 0, ptr);
   2507          i->subOp = NV50_IR_SUBOP_LDC_IS;
   2508       }
   2509    } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
   2510       assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
   2511       i->op = OP_VFETCH;
   2512    } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
   2513       Value *ind = i->getIndirect(0, 1);
   2514       Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
   2515       // XXX come up with a way not to do this for EVERY little access but
   2516       // rather to batch these up somehow. Unfortunately we've lost the
   2517       // information about the field width by the time we get here.
   2518       Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
   2519       Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
   2520       Value *pred = new_LValue(func, FILE_PREDICATE);
   2521       if (i->src(0).isIndirect(0)) {
   2522          bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
   2523          bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
   2524       }
   2525       i->setIndirect(0, 1, NULL);
   2526       i->setIndirect(0, 0, ptr);
   2527       i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   2528       bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
   2529       i->setPredicate(CC_NOT_P, pred);
   2530       if (i->defExists(0)) {
   2531          Value *zero, *dst = i->getDef(0);
   2532          i->setDef(0, bld.getSSA());
   2533 
   2534          bld.setPosition(i, true);
   2535          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
   2536             ->setPredicate(CC_P, pred);
   2537          bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
   2538       }
   2539    }
   2540 }
   2541 
   2542 void
   2543 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
   2544 {
   2545    Value *laneid = bld.getSSA();
   2546    Value *x, *y;
   2547 
   2548    bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
   2549 
   2550    if (c == 0) {
   2551       x = dst;
   2552       y = NULL;
   2553    } else
   2554    if (c == 1) {
   2555       x = NULL;
   2556       y = dst;
   2557    } else {
   2558       assert(c == 2);
   2559       if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
   2560          bld.mkMov(dst, bld.loadImm(NULL, 0));
   2561          return;
   2562       }
   2563       x = bld.getSSA();
   2564       y = bld.getSSA();
   2565    }
   2566    if (x)
   2567       bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
   2568    if (y)
   2569       bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
   2570 
   2571    if (c == 2) {
   2572       bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
   2573       bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
   2574    }
   2575 }
   2576 
   2577 bool
   2578 NVC0LoweringPass::handleRDSV(Instruction *i)
   2579 {
   2580    Symbol *sym = i->getSrc(0)->asSym();
   2581    const SVSemantic sv = sym->reg.data.sv.sv;
   2582    Value *vtx = NULL;
   2583    Instruction *ld;
   2584    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   2585 
   2586    if (addr >= 0x400) {
   2587       // mov $sreg
   2588       if (sym->reg.data.sv.index == 3) {
   2589          // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
   2590          i->op = OP_MOV;
   2591          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
   2592       }
   2593       if (sv == SV_VERTEX_COUNT) {
   2594          bld.setPosition(i, true);
   2595          bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
   2596       }
   2597       return true;
   2598    }
   2599 
   2600    switch (sv) {
   2601    case SV_POSITION:
   2602       assert(prog->getType() == Program::TYPE_FRAGMENT);
   2603       if (i->srcExists(1)) {
   2604          // Pass offset through to the interpolation logic
   2605          ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
   2606                            i->getDef(0), addr, NULL);
   2607          ld->setSrc(1, i->getSrc(1));
   2608       } else {
   2609          bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
   2610       }
   2611       break;
   2612    case SV_FACE:
   2613    {
   2614       Value *face = i->getDef(0);
   2615       bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
   2616       if (i->dType == TYPE_F32) {
   2617          bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
   2618          bld.mkOp1(OP_NEG, TYPE_S32, face, face);
   2619          bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
   2620       }
   2621    }
   2622       break;
   2623    case SV_TESS_COORD:
   2624       assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
   2625       readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
   2626       break;
   2627    case SV_NTID:
   2628    case SV_NCTAID:
   2629    case SV_GRIDID:
   2630       assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
   2631       if (sym->reg.data.sv.index == 3) {
   2632          i->op = OP_MOV;
   2633          i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
   2634          return true;
   2635       }
   2636       // Fallthrough
   2637    case SV_WORK_DIM:
   2638       addr += prog->driver->prop.cp.gridInfoBase;
   2639       bld.mkLoad(TYPE_U32, i->getDef(0),
   2640                  bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
   2641                               TYPE_U32, addr), NULL);
   2642       break;
   2643    case SV_SAMPLE_INDEX:
   2644       // TODO: Properly pass source as an address in the PIX address space
   2645       // (which can be of the form [r0+offset]). But this is currently
   2646       // unnecessary.
   2647       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
   2648       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
   2649       break;
   2650    case SV_SAMPLE_POS: {
   2651       Value *off = new_LValue(func, FILE_GPR);
   2652       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
   2653       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
   2654       bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
   2655       bld.mkLoad(TYPE_F32,
   2656                  i->getDef(0),
   2657                  bld.mkSymbol(
   2658                        FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
   2659                        TYPE_U32, prog->driver->io.sampleInfoBase +
   2660                        4 * sym->reg.data.sv.index),
   2661                  off);
   2662       break;
   2663    }
   2664    case SV_SAMPLE_MASK: {
   2665       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
   2666       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
   2667       Instruction *sampleid =
   2668          bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
   2669       sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
   2670       Value *masked =
   2671          bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
   2672                     bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
   2673                                bld.loadImm(NULL, 1), sampleid->getDef(0)));
   2674       if (prog->driver->prop.fp.persampleInvocation) {
   2675          bld.mkMov(i->getDef(0), masked);
   2676       } else {
   2677          bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
   2678                    bld.mkImm(0))
   2679             ->subOp = 1;
   2680       }
   2681       break;
   2682    }
   2683    case SV_BASEVERTEX:
   2684    case SV_BASEINSTANCE:
   2685    case SV_DRAWID:
   2686       ld = bld.mkLoad(TYPE_U32, i->getDef(0),
   2687                       bld.mkSymbol(FILE_MEMORY_CONST,
   2688                                    prog->driver->io.auxCBSlot,
   2689                                    TYPE_U32,
   2690                                    prog->driver->io.drawInfoBase +
   2691                                    4 * (sv - SV_BASEVERTEX)),
   2692                       NULL);
   2693       break;
   2694    default:
   2695       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
   2696          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
   2697       if (prog->getType() == Program::TYPE_FRAGMENT) {
   2698          bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
   2699       } else {
   2700          ld = bld.mkFetch(i->getDef(0), i->dType,
   2701                           FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
   2702          ld->perPatch = i->perPatch;
   2703       }
   2704       break;
   2705    }
   2706    bld.getBB()->remove(i);
   2707    return true;
   2708 }
   2709 
   2710 bool
   2711 NVC0LoweringPass::handleDIV(Instruction *i)
   2712 {
   2713    if (!isFloatType(i->dType))
   2714       return true;
   2715    bld.setPosition(i, false);
   2716    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
   2717    i->op = OP_MUL;
   2718    i->setSrc(1, rcp->getDef(0));
   2719    return true;
   2720 }
   2721 
   2722 bool
   2723 NVC0LoweringPass::handleMOD(Instruction *i)
   2724 {
   2725    if (!isFloatType(i->dType))
   2726       return true;
   2727    LValue *value = bld.getScratch(typeSizeof(i->dType));
   2728    bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
   2729    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
   2730    bld.mkOp1(OP_TRUNC, i->dType, value, value);
   2731    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
   2732    i->op = OP_SUB;
   2733    i->setSrc(1, value);
   2734    return true;
   2735 }
   2736 
   2737 bool
   2738 NVC0LoweringPass::handleSQRT(Instruction *i)
   2739 {
   2740    if (i->dType == TYPE_F64) {
   2741       Value *pred = bld.getSSA(1, FILE_PREDICATE);
   2742       Value *zero = bld.loadImm(NULL, 0.0);
   2743       Value *dst = bld.getSSA(8);
   2744       bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
   2745       bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
   2746       bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
   2747       i->op = OP_MUL;
   2748       i->setSrc(1, dst);
   2749       // TODO: Handle this properly with a library function
   2750    } else {
   2751       bld.setPosition(i, true);
   2752       i->op = OP_RSQ;
   2753       bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
   2754    }
   2755 
   2756    return true;
   2757 }
   2758 
   2759 bool
   2760 NVC0LoweringPass::handlePOW(Instruction *i)
   2761 {
   2762    LValue *val = bld.getScratch();
   2763 
   2764    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   2765    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   2766    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
   2767 
   2768    i->op = OP_EX2;
   2769    i->setSrc(0, val);
   2770    i->setSrc(1, NULL);
   2771 
   2772    return true;
   2773 }
   2774 
   2775 bool
   2776 NVC0LoweringPass::handleEXPORT(Instruction *i)
   2777 {
   2778    if (prog->getType() == Program::TYPE_FRAGMENT) {
   2779       int id = i->getSrc(0)->reg.data.offset / 4;
   2780 
   2781       if (i->src(0).isIndirect(0)) // TODO, ugly
   2782          return false;
   2783       i->op = OP_MOV;
   2784       i->subOp = NV50_IR_SUBOP_MOV_FINAL;
   2785       i->src(0).set(i->src(1));
   2786       i->setSrc(1, NULL);
   2787       i->setDef(0, new_LValue(func, FILE_GPR));
   2788       i->getDef(0)->reg.data.id = id;
   2789 
   2790       prog->maxGPR = MAX2(prog->maxGPR, id);
   2791    } else
   2792    if (prog->getType() == Program::TYPE_GEOMETRY) {
   2793       i->setIndirect(0, 1, gpEmitAddress);
   2794    }
   2795    return true;
   2796 }
   2797 
   2798 bool
   2799 NVC0LoweringPass::handleOUT(Instruction *i)
   2800 {
   2801    Instruction *prev = i->prev;
   2802    ImmediateValue stream, prevStream;
   2803 
   2804    // Only merge if the stream ids match. Also, note that the previous
   2805    // instruction would have already been lowered, so we take arg1 from it.
   2806    if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
   2807        i->src(0).getImmediate(stream) &&
   2808        prev->src(1).getImmediate(prevStream) &&
   2809        stream.reg.data.u32 == prevStream.reg.data.u32) {
   2810       i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
   2811       delete_Instruction(prog, i);
   2812    } else {
   2813       assert(gpEmitAddress);
   2814       i->setDef(0, gpEmitAddress);
   2815       i->setSrc(1, i->getSrc(0));
   2816       i->setSrc(0, gpEmitAddress);
   2817    }
   2818    return true;
   2819 }
   2820 
   2821 // Generate a binary predicate if an instruction is predicated by
   2822 // e.g. an f32 value.
   2823 void
   2824 NVC0LoweringPass::checkPredicate(Instruction *insn)
   2825 {
   2826    Value *pred = insn->getPredicate();
   2827    Value *pdst;
   2828 
   2829    if (!pred || pred->reg.file == FILE_PREDICATE)
   2830       return;
   2831    pdst = new_LValue(func, FILE_PREDICATE);
   2832 
   2833    // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   2834    //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
   2835 
   2836    bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
   2837 
   2838    insn->setPredicate(insn->cc, pdst);
   2839 }
   2840 
   2841 //
   2842 // - add quadop dance for texturing
   2843 // - put FP outputs in GPRs
   2844 // - convert instruction sequences
   2845 //
   2846 bool
   2847 NVC0LoweringPass::visit(Instruction *i)
   2848 {
   2849    bool ret = true;
   2850    bld.setPosition(i, false);
   2851 
   2852    if (i->cc != CC_ALWAYS)
   2853       checkPredicate(i);
   2854 
   2855    switch (i->op) {
   2856    case OP_TEX:
   2857    case OP_TXB:
   2858    case OP_TXL:
   2859    case OP_TXF:
   2860    case OP_TXG:
   2861       return handleTEX(i->asTex());
   2862    case OP_TXD:
   2863       return handleTXD(i->asTex());
   2864    case OP_TXLQ:
   2865       return handleTXLQ(i->asTex());
   2866    case OP_TXQ:
   2867      return handleTXQ(i->asTex());
   2868    case OP_EX2:
   2869       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
   2870       i->setSrc(0, i->getDef(0));
   2871       break;
   2872    case OP_POW:
   2873       return handlePOW(i);
   2874    case OP_DIV:
   2875       return handleDIV(i);
   2876    case OP_MOD:
   2877       return handleMOD(i);
   2878    case OP_SQRT:
   2879       return handleSQRT(i);
   2880    case OP_EXPORT:
   2881       ret = handleEXPORT(i);
   2882       break;
   2883    case OP_EMIT:
   2884    case OP_RESTART:
   2885       return handleOUT(i);
   2886    case OP_RDSV:
   2887       return handleRDSV(i);
   2888    case OP_WRSV:
   2889       return handleWRSV(i);
   2890    case OP_STORE:
   2891    case OP_LOAD:
   2892       handleLDST(i);
   2893       break;
   2894    case OP_ATOM:
   2895    {
   2896       const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
   2897       handleATOM(i);
   2898       handleCasExch(i, cctl);
   2899    }
   2900       break;
   2901    case OP_SULDB:
   2902    case OP_SULDP:
   2903    case OP_SUSTB:
   2904    case OP_SUSTP:
   2905    case OP_SUREDB:
   2906    case OP_SUREDP:
   2907       if (targ->getChipset() >= NVISA_GM107_CHIPSET)
   2908          handleSurfaceOpGM107(i->asTex());
   2909       else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
   2910          handleSurfaceOpNVE4(i->asTex());
   2911       else
   2912          handleSurfaceOpNVC0(i->asTex());
   2913       break;
   2914    case OP_SUQ:
   2915       handleSUQ(i->asTex());
   2916       break;
   2917    case OP_BUFQ:
   2918       handleBUFQ(i);
   2919       break;
   2920    default:
   2921       break;
   2922    }
   2923 
   2924    /* Kepler+ has a special opcode to compute a new base address to be used
   2925     * for indirect loads.
   2926     *
   2927     * Maxwell+ has an additional similar requirement for indirect
   2928     * interpolation ops in frag shaders.
   2929     */
   2930    bool doAfetch = false;
   2931    if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
   2932        !i->perPatch &&
   2933        (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
   2934        i->src(0).isIndirect(0)) {
   2935       doAfetch = true;
   2936    }
   2937    if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
   2938        (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
   2939        i->src(0).isIndirect(0)) {
   2940       doAfetch = true;
   2941    }
   2942 
   2943    if (doAfetch) {
   2944       Value *addr = cloneShallow(func, i->getSrc(0));
   2945       Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
   2946                                       i->getSrc(0));
   2947       afetch->setIndirect(0, 0, i->getIndirect(0, 0));
   2948       addr->reg.data.offset = 0;
   2949       i->setSrc(0, addr);
   2950       i->setIndirect(0, 0, afetch->getDef(0));
   2951    }
   2952 
   2953    return ret;
   2954 }
   2955 
   2956 bool
   2957 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
   2958 {
   2959    if (stage == CG_STAGE_PRE_SSA) {
   2960       NVC0LoweringPass pass(prog);
   2961       return pass.run(prog, false, true);
   2962    } else
   2963    if (stage == CG_STAGE_POST_RA) {
   2964       NVC0LegalizePostRA pass(prog);
   2965       return pass.run(prog, false, true);
   2966    } else
   2967    if (stage == CG_STAGE_SSA) {
   2968       NVC0LegalizeSSA pass;
   2969       return pass.run(prog, false, true);
   2970    }
   2971    return false;
   2972 }
   2973 
   2974 } // namespace nv50_ir
   2975