Home | History | Annotate | Download | only in codegen
      1 /*
      2  * Copyright 2011 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "codegen/nv50_ir.h"
     24 #include "codegen/nv50_ir_build_util.h"
     25 
     26 #include "codegen/nv50_ir_target_nv50.h"
     27 
     28 namespace nv50_ir {
     29 
     30 // nv50 doesn't support 32 bit integer multiplication
     31 //
     32 //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
     33 // -------------------
     34 //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
     35 // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
     36 //       al*bl
     37 //    ah*bl 00
     38 //
     39 // fffe0001 + fffe0001
     40 //
     41 // Note that this sort of splitting doesn't work for signed values, so we
     42 // compute the sign on those manually and then perform an unsigned multiply.
     43 static bool
     44 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
     45 {
     46    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
     47    ImmediateValue src1;
     48    bool src1imm = mul->src(1).getImmediate(src1);
     49 
     50    DataType fTy; // full type
     51    switch (mul->sType) {
     52    case TYPE_S32: fTy = TYPE_U32; break;
     53    case TYPE_S64: fTy = TYPE_U64; break;
     54    default: fTy = mul->sType; break;
     55    }
     56 
     57    DataType hTy; // half type
     58    switch (fTy) {
     59    case TYPE_U32: hTy = TYPE_U16; break;
     60    case TYPE_U64: hTy = TYPE_U32; break;
     61    default:
     62       return false;
     63    }
     64    unsigned int fullSize = typeSizeof(fTy);
     65    unsigned int halfSize = typeSizeof(hTy);
     66 
     67    Instruction *i[9];
     68 
     69    bld->setPosition(mul, true);
     70 
     71    Value *s[2];
     72    Value *a[2], *b[2];
     73    Value *t[4];
     74    for (int j = 0; j < 4; ++j)
     75       t[j] = bld->getSSA(fullSize);
     76 
     77    if (isSignedType(mul->sType) && highResult) {
     78       s[0] = bld->getSSA(fullSize);
     79       s[1] = bld->getSSA(fullSize);
     80       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
     81       bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
     82       src1.reg.data.s32 = abs(src1.reg.data.s32);
     83    } else {
     84       s[0] = mul->getSrc(0);
     85       s[1] = mul->getSrc(1);
     86    }
     87 
     88    // split sources into halves
     89    i[0] = bld->mkSplit(a, halfSize, s[0]);
     90    i[1] = bld->mkSplit(b, halfSize, s[1]);
     91 
     92    if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
     93       i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
     94                                bld->mkImm(src1.reg.data.u32 & 0xffff));
     95    } else {
     96       i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
     97                         src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
     98       if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
     99          i[3] = i[2];
    100          t[1] = t[0];
    101       } else {
    102          i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
    103       }
    104    }
    105    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
    106    if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
    107       i[4] = i[3];
    108       t[3] = t[2];
    109    } else {
    110       i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
    111    }
    112 
    113    if (highResult) {
    114       Value *c[2];
    115       Value *r[5];
    116       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
    117       c[0] = bld->getSSA(1, FILE_FLAGS);
    118       c[1] = bld->getSSA(1, FILE_FLAGS);
    119       for (int j = 0; j < 5; ++j)
    120          r[j] = bld->getSSA(fullSize);
    121 
    122       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
    123       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
    124       bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
    125       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
    126       i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
    127 
    128       // set carry defs / sources
    129       i[3]->setFlagsDef(1, c[0]);
    130       // actual result required in negative case, but ignored for
    131       // unsigned. for some reason the compiler ends up dropping the whole
    132       // instruction if the destination is unused but the flags are.
    133       if (isSignedType(mul->sType))
    134          i[4]->setFlagsDef(1, c[1]);
    135       else
    136          i[4]->setFlagsDef(0, c[1]);
    137       i[6]->setPredicate(CC_C, c[0]);
    138       i[5]->setFlagsSrc(3, c[1]);
    139 
    140       if (isSignedType(mul->sType)) {
    141          Value *cc[2];
    142          Value *rr[7];
    143          Value *one = bld->getSSA(fullSize);
    144          bld->loadImm(one, 1);
    145          for (int j = 0; j < 7; j++)
    146             rr[j] = bld->getSSA(fullSize);
    147 
    148          // NOTE: this logic uses predicates because splitting basic blocks is
    149          // ~impossible during the SSA phase. The RA relies on a correlation
    150          // between edge order and phi node sources.
    151 
    152          // Set the sign of the result based on the inputs
    153          bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
    154             ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
    155 
    156          // 1s complement of 64-bit value
    157          bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
    158             ->setPredicate(CC_S, cc[0]);
    159          bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
    160             ->setPredicate(CC_S, cc[0]);
    161 
    162          // add to low 32-bits, keep track of the carry
    163          Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
    164          n->setPredicate(CC_S, cc[0]);
    165          n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
    166 
    167          // If there was a carry, add 1 to the upper 32 bits
    168          // XXX: These get executed even if they shouldn't be
    169          bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
    170             ->setPredicate(CC_C, cc[1]);
    171          bld->mkMov(rr[3], rr[0])
    172             ->setPredicate(CC_NC, cc[1]);
    173          bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
    174 
    175          // Merge the results from the negative and non-negative paths
    176          bld->mkMov(rr[5], rr[4])
    177             ->setPredicate(CC_S, cc[0]);
    178          bld->mkMov(rr[6], r[4])
    179             ->setPredicate(CC_NS, cc[0]);
    180          bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
    181       } else {
    182          bld->mkMov(mul->getDef(0), r[4]);
    183       }
    184    } else {
    185       bld->mkMov(mul->getDef(0), t[3]);
    186    }
    187    delete_Instruction(bld->getProgram(), mul);
    188 
    189    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
    190       if (i[j])
    191          i[j]->sType = hTy;
    192 
    193    return true;
    194 }
    195 
    196 #define QOP_ADD  0
    197 #define QOP_SUBR 1
    198 #define QOP_SUB  2
    199 #define QOP_MOV2 3
    200 
    201 //             UL UR LL LR
    202 #define QUADOP(q, r, s, t)            \
    203    ((QOP_##q << 6) | (QOP_##r << 4) | \
    204     (QOP_##s << 2) | (QOP_##t << 0))
    205 
    206 class NV50LegalizePostRA : public Pass
    207 {
    208 private:
    209    virtual bool visit(Function *);
    210    virtual bool visit(BasicBlock *);
    211 
    212    void handlePRERET(FlowInstruction *);
    213    void replaceZero(Instruction *);
    214 
    215    LValue *r63;
    216 };
    217 
    218 bool
    219 NV50LegalizePostRA::visit(Function *fn)
    220 {
    221    Program *prog = fn->getProgram();
    222 
    223    r63 = new_LValue(fn, FILE_GPR);
    224    // GPR units on nv50 are in half-regs
    225    if (prog->maxGPR < 126)
    226       r63->reg.data.id = 63;
    227    else
    228       r63->reg.data.id = 127;
    229 
    230    // this is actually per-program, but we can do it all on visiting main()
    231    std::list<Instruction *> *outWrites =
    232       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
    233 
    234    if (outWrites) {
    235       for (std::list<Instruction *>::iterator it = outWrites->begin();
    236            it != outWrites->end(); ++it)
    237          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
    238       // instructions will be deleted on exit
    239       outWrites->clear();
    240    }
    241 
    242    return true;
    243 }
    244 
    245 void
    246 NV50LegalizePostRA::replaceZero(Instruction *i)
    247 {
    248    for (int s = 0; i->srcExists(s); ++s) {
    249       ImmediateValue *imm = i->getSrc(s)->asImm();
    250       if (imm && imm->reg.data.u64 == 0)
    251          i->setSrc(s, r63);
    252    }
    253 }
    254 
    255 // Emulate PRERET: jump to the target and call to the origin from there
    256 //
    257 // WARNING: atm only works if BBs are affected by at most a single PRERET
    258 //
    259 // BB:0
    260 // preret BB:3
    261 // (...)
    262 // BB:3
    263 // (...)
    264 //             --->
    265 // BB:0
    266 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
    267 // (...)
    268 // BB:3
    269 // bra BB:3 + n1 (skip the call)
    270 // call BB:0 + n2 (skip bra at beginning of BB:0)
    271 // (...)
    272 void
    273 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
    274 {
    275    BasicBlock *bbE = pre->bb;
    276    BasicBlock *bbT = pre->target.bb;
    277 
    278    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
    279    bbE->remove(pre);
    280    bbE->insertHead(pre);
    281 
    282    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
    283    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
    284 
    285    bbT->insertHead(call);
    286    bbT->insertHead(skip);
    287 
    288    // NOTE: maybe split blocks to prevent the instructions from moving ?
    289 
    290    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
    291    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
    292 }
    293 
    294 bool
    295 NV50LegalizePostRA::visit(BasicBlock *bb)
    296 {
    297    Instruction *i, *next;
    298 
    299    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
    300    for (i = bb->getFirst(); i; i = next) {
    301       next = i->next;
    302       if (i->isNop()) {
    303          bb->remove(i);
    304       } else
    305       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
    306          handlePRERET(i->asFlow());
    307       } else {
    308          // TODO: We will want to do this before register allocation,
    309          // since have to use a $c register for the carry flag.
    310          if (typeSizeof(i->dType) == 8) {
    311             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
    312             if (hi)
    313                next = hi;
    314          }
    315 
    316          if (i->op != OP_PFETCH && i->op != OP_BAR &&
    317              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
    318             replaceZero(i);
    319       }
    320    }
    321    if (!bb->getEntry())
    322       return true;
    323 
    324    return true;
    325 }
    326 
    327 class NV50LegalizeSSA : public Pass
    328 {
    329 public:
    330    NV50LegalizeSSA(Program *);
    331 
    332    virtual bool visit(BasicBlock *bb);
    333 
    334 private:
    335    void propagateWriteToOutput(Instruction *);
    336    void handleDIV(Instruction *);
    337    void handleMOD(Instruction *);
    338    void handleMUL(Instruction *);
    339    void handleAddrDef(Instruction *);
    340 
    341    inline bool isARL(const Instruction *) const;
    342 
    343    BuildUtil bld;
    344 
    345    std::list<Instruction *> *outWrites;
    346 };
    347 
    348 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
    349 {
    350    bld.setProgram(prog);
    351 
    352    if (prog->optLevel >= 2 &&
    353        (prog->getType() == Program::TYPE_GEOMETRY ||
    354         prog->getType() == Program::TYPE_VERTEX))
    355       outWrites =
    356          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
    357    else
    358       outWrites = NULL;
    359 }
    360 
    361 void
    362 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
    363 {
    364    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
    365       return;
    366 
    367    // check def instruction can store
    368    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
    369 
    370    // TODO: move exports (if beneficial) in common opt pass
    371    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
    372       return;
    373 
    374    for (int s = 0; di->srcExists(s); ++s)
    375       if (di->src(s).getFile() == FILE_IMMEDIATE ||
    376           di->src(s).getFile() == FILE_MEMORY_LOCAL)
    377          return;
    378 
    379    if (prog->getType() == Program::TYPE_GEOMETRY) {
    380       // Only propagate output writes in geometry shaders when we can be sure
    381       // that we are propagating to the same output vertex.
    382       if (di->bb != st->bb)
    383          return;
    384       Instruction *i;
    385       for (i = di; i != st; i = i->next) {
    386          if (i->op == OP_EMIT || i->op == OP_RESTART)
    387             return;
    388       }
    389       assert(i); // st after di
    390    }
    391 
    392    // We cannot set defs to non-lvalues before register allocation, so
    393    // save & remove (to save registers) the exports and replace later.
    394    outWrites->push_back(st);
    395    st->bb->remove(st);
    396 }
    397 
    398 bool
    399 NV50LegalizeSSA::isARL(const Instruction *i) const
    400 {
    401    ImmediateValue imm;
    402 
    403    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
    404       return false;
    405    if (!i->src(1).getImmediate(imm))
    406       return false;
    407    return imm.isInteger(0);
    408 }
    409 
    410 void
    411 NV50LegalizeSSA::handleAddrDef(Instruction *i)
    412 {
    413    Instruction *arl;
    414 
    415    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
    416 
    417    // PFETCH can always write to $a
    418    if (i->op == OP_PFETCH)
    419       return;
    420    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
    421    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
    422       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
    423          return;
    424       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
    425          return;
    426    }
    427 
    428    // turn $a sources into $r sources (can't operate on $a)
    429    for (int s = 0; i->srcExists(s); ++s) {
    430       Value *a = i->getSrc(s);
    431       Value *r;
    432       if (a->reg.file == FILE_ADDRESS) {
    433          if (a->getInsn() && isARL(a->getInsn())) {
    434             i->setSrc(s, a->getInsn()->getSrc(0));
    435          } else {
    436             bld.setPosition(i, false);
    437             r = bld.getSSA();
    438             bld.mkMov(r, a);
    439             i->setSrc(s, r);
    440          }
    441       }
    442    }
    443    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
    444       return;
    445 
    446    // turn result back into $a
    447    bld.setPosition(i, true);
    448    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
    449    i->setDef(0, arl->getSrc(0));
    450 }
    451 
    452 void
    453 NV50LegalizeSSA::handleMUL(Instruction *mul)
    454 {
    455    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
    456       return;
    457    Value *def = mul->getDef(0);
    458    Value *pred = mul->getPredicate();
    459    CondCode cc = mul->cc;
    460    if (pred)
    461       mul->setPredicate(CC_ALWAYS, NULL);
    462 
    463    if (mul->op == OP_MAD) {
    464       Instruction *add = mul;
    465       bld.setPosition(add, false);
    466       Value *res = cloneShallow(func, mul->getDef(0));
    467       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
    468       add->op = OP_ADD;
    469       add->setSrc(0, mul->getDef(0));
    470       add->setSrc(1, add->getSrc(2));
    471       for (int s = 2; add->srcExists(s); ++s)
    472          add->setSrc(s, NULL);
    473       mul->subOp = add->subOp;
    474       add->subOp = 0;
    475    }
    476    expandIntegerMUL(&bld, mul);
    477    if (pred)
    478       def->getInsn()->setPredicate(cc, pred);
    479 }
    480 
    481 // Use f32 division: first compute an approximate result, use it to reduce
    482 // the dividend, which should then be representable as f32, divide the reduced
    483 // dividend, and add the quotients.
    484 void
    485 NV50LegalizeSSA::handleDIV(Instruction *div)
    486 {
    487    const DataType ty = div->sType;
    488 
    489    if (ty != TYPE_U32 && ty != TYPE_S32)
    490       return;
    491 
    492    Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
    493 
    494    bld.setPosition(div, false);
    495 
    496    Value *a, *af = bld.getSSA();
    497    Value *b, *bf = bld.getSSA();
    498 
    499    bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
    500    bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
    501 
    502    if (isSignedType(ty)) {
    503       af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
    504       bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
    505       a = bld.getSSA();
    506       b = bld.getSSA();
    507       bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
    508       bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
    509    } else {
    510       a = div->getSrc(0);
    511       b = div->getSrc(1);
    512    }
    513 
    514    bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
    515    bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
    516 
    517    bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
    518    bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
    519 
    520    // get error of 1st result
    521    expandIntegerMUL(&bld,
    522       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
    523    bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
    524 
    525    bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
    526 
    527    bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
    528    bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
    529       ->rnd = ROUND_Z;
    530    bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
    531 
    532    // correction: if modulus >= divisor, add 1
    533    expandIntegerMUL(&bld,
    534       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
    535    bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
    536    bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
    537    if (!isSignedType(ty)) {
    538       div->op = OP_SUB;
    539       div->setSrc(0, q);
    540       div->setSrc(1, s);
    541    } else {
    542       t = q;
    543       bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
    544       s = bld.getSSA();
    545       t = bld.getSSA();
    546       // fix the sign
    547       bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
    548          ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
    549       bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
    550       bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
    551 
    552       div->op = OP_UNION;
    553       div->setSrc(0, s);
    554       div->setSrc(1, t);
    555    }
    556 }
    557 
    558 void
    559 NV50LegalizeSSA::handleMOD(Instruction *mod)
    560 {
    561    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
    562       return;
    563    bld.setPosition(mod, false);
    564 
    565    Value *q = bld.getSSA();
    566    Value *m = bld.getSSA();
    567 
    568    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
    569    handleDIV(q->getInsn());
    570 
    571    bld.setPosition(mod, false);
    572    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
    573 
    574    mod->op = OP_SUB;
    575    mod->setSrc(1, m);
    576 }
    577 
    578 bool
    579 NV50LegalizeSSA::visit(BasicBlock *bb)
    580 {
    581    Instruction *insn, *next;
    582    // skipping PHIs (don't pass them to handleAddrDef) !
    583    for (insn = bb->getEntry(); insn; insn = next) {
    584       next = insn->next;
    585 
    586       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
    587          handleAddrDef(insn);
    588 
    589       switch (insn->op) {
    590       case OP_EXPORT:
    591          if (outWrites)
    592             propagateWriteToOutput(insn);
    593          break;
    594       case OP_DIV:
    595          handleDIV(insn);
    596          break;
    597       case OP_MOD:
    598          handleMOD(insn);
    599          break;
    600       case OP_MAD:
    601       case OP_MUL:
    602          handleMUL(insn);
    603          break;
    604       default:
    605          break;
    606       }
    607    }
    608    return true;
    609 }
    610 
    611 class NV50LoweringPreSSA : public Pass
    612 {
    613 public:
    614    NV50LoweringPreSSA(Program *);
    615 
    616 private:
    617    virtual bool visit(Instruction *);
    618    virtual bool visit(Function *);
    619 
    620    bool handleRDSV(Instruction *);
    621    bool handleWRSV(Instruction *);
    622 
    623    bool handlePFETCH(Instruction *);
    624    bool handleEXPORT(Instruction *);
    625    bool handleLOAD(Instruction *);
    626 
    627    bool handleDIV(Instruction *);
    628    bool handleSQRT(Instruction *);
    629    bool handlePOW(Instruction *);
    630 
    631    bool handleSET(Instruction *);
    632    bool handleSLCT(CmpInstruction *);
    633    bool handleSELP(Instruction *);
    634 
    635    bool handleTEX(TexInstruction *);
    636    bool handleTXB(TexInstruction *); // I really
    637    bool handleTXL(TexInstruction *); // hate
    638    bool handleTXD(TexInstruction *); // these 3
    639    bool handleTXLQ(TexInstruction *);
    640    bool handleTXQ(TexInstruction *);
    641 
    642    bool handleCALL(Instruction *);
    643    bool handlePRECONT(Instruction *);
    644    bool handleCONT(Instruction *);
    645 
    646    void checkPredicate(Instruction *);
    647    void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
    648    void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
    649 
    650 private:
    651    const Target *const targ;
    652 
    653    BuildUtil bld;
    654 
    655    Value *tid;
    656 };
    657 
    658 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
    659    targ(prog->getTarget()), tid(NULL)
    660 {
    661    bld.setProgram(prog);
    662 }
    663 
    664 bool
    665 NV50LoweringPreSSA::visit(Function *f)
    666 {
    667    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
    668 
    669    if (prog->getType() == Program::TYPE_COMPUTE) {
    670       // Add implicit "thread id" argument in $r0 to the function
    671       Value *arg = new_LValue(func, FILE_GPR);
    672       arg->reg.data.id = 0;
    673       f->ins.push_back(arg);
    674 
    675       bld.setPosition(root, false);
    676       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
    677    }
    678 
    679    return true;
    680 }
    681 
    682 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
    683                                        Value **ms_x, Value **ms_y) {
    684    // This loads the texture-indexed ms setting from the constant buffer
    685    Value *tmp = new_LValue(func, FILE_GPR);
    686    uint8_t b = prog->driver->io.auxCBSlot;
    687    off += prog->driver->io.suInfoBase;
    688    if (prog->getType() > Program::TYPE_VERTEX)
    689       off += 16 * 2 * 4;
    690    if (prog->getType() > Program::TYPE_GEOMETRY)
    691       off += 16 * 2 * 4;
    692    *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
    693                              FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
    694    *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
    695                              FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
    696    *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
    697 }
    698 
    699 void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
    700    // Given a MS level, and a sample id, compute the delta x/y
    701    uint8_t b = prog->driver->io.msInfoCBSlot;
    702    Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
    703 
    704    // The required information is at mslevel * 16 * 4 + sample * 8
    705    // = (mslevel * 8 + sample) * 8
    706    bld.mkOp2(OP_SHL,
    707              TYPE_U32,
    708              off,
    709              bld.mkOp2v(OP_ADD, TYPE_U32, t,
    710                         bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
    711                         s),
    712              bld.mkImm(3));
    713    *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
    714                            FILE_MEMORY_CONST, b, TYPE_U32,
    715                            prog->driver->io.msInfoBase), off);
    716    *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
    717                            FILE_MEMORY_CONST, b, TYPE_U32,
    718                            prog->driver->io.msInfoBase + 4), off);
    719 }
    720 
    721 bool
    722 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
    723 {
    724    const int arg = i->tex.target.getArgCount();
    725    const int dref = arg;
    726    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
    727 
    728    /* Only normalize in the non-explicit derivatives case.
    729     */
    730    if (i->tex.target.isCube() && i->op != OP_TXD) {
    731       Value *src[3], *val;
    732       int c;
    733       for (c = 0; c < 3; ++c)
    734          src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
    735       val = bld.getScratch();
    736       bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
    737       bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
    738       bld.mkOp1(OP_RCP, TYPE_F32, val, val);
    739       for (c = 0; c < 3; ++c) {
    740          i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
    741                                  i->getSrc(c), val));
    742       }
    743    }
    744 
    745    // handle MS, which means looking up the MS params for this texture, and
    746    // adjusting the input coordinates to point at the right sample.
    747    if (i->tex.target.isMS()) {
    748       Value *x = i->getSrc(0);
    749       Value *y = i->getSrc(1);
    750       Value *s = i->getSrc(arg - 1);
    751       Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
    752          *ms, *ms_x, *ms_y, *dx, *dy;
    753 
    754       i->tex.target.clearMS();
    755 
    756       loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
    757       loadMsInfo(ms, s, &dx, &dy);
    758 
    759       bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
    760       bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
    761       bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
    762       bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
    763       i->setSrc(0, tx);
    764       i->setSrc(1, ty);
    765       i->setSrc(arg - 1, bld.loadImm(NULL, 0));
    766    }
    767 
    768    // dref comes before bias/lod
    769    if (i->tex.target.isShadow())
    770       if (i->op == OP_TXB || i->op == OP_TXL)
    771          i->swapSources(dref, lod);
    772 
    773    if (i->tex.target.isArray()) {
    774       if (i->op != OP_TXF) {
    775          // array index must be converted to u32, but it's already an integer
    776          // for TXF
    777          Value *layer = i->getSrc(arg - 1);
    778          LValue *src = new_LValue(func, FILE_GPR);
    779          bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
    780          bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
    781          i->setSrc(arg - 1, src);
    782       }
    783       if (i->tex.target.isCube() && i->srcCount() > 4) {
    784          std::vector<Value *> acube, a2d;
    785          int c;
    786 
    787          acube.resize(4);
    788          for (c = 0; c < 4; ++c)
    789             acube[c] = i->getSrc(c);
    790          a2d.resize(4);
    791          for (c = 0; c < 3; ++c)
    792             a2d[c] = new_LValue(func, FILE_GPR);
    793          a2d[3] = NULL;
    794 
    795          bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
    796                    a2d, acube)->asTex()->tex.mask = 0x7;
    797 
    798          for (c = 0; c < 3; ++c)
    799             i->setSrc(c, a2d[c]);
    800          for (; i->srcExists(c + 1); ++c)
    801             i->setSrc(c, i->getSrc(c + 1));
    802          i->setSrc(c, NULL);
    803          assert(c <= 4);
    804 
    805          i->tex.target = i->tex.target.isShadow() ?
    806             TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
    807       }
    808    }
    809 
    810    // texel offsets are 3 immediate fields in the instruction,
    811    // nv50 cannot do textureGatherOffsets
    812    assert(i->tex.useOffsets <= 1);
    813    if (i->tex.useOffsets) {
    814       for (int c = 0; c < 3; ++c) {
    815          ImmediateValue val;
    816          if (!i->offset[0][c].getImmediate(val))
    817             assert(!"non-immediate offset");
    818          i->tex.offset[c] = val.reg.data.u32;
    819          i->offset[0][c].set(NULL);
    820       }
    821    }
    822 
    823    return true;
    824 }
    825 
    826 // Bias must be equal for all threads of a quad or lod calculation will fail.
    827 //
    828 // The lanes of a quad are grouped by the bit in the condition register they
    829 // have set, which is selected by differing bias values.
    830 // Move the input values for TEX into a new register set for each group and
    831 // execute TEX only for a specific group.
    832 // We always need to use 4 new registers for the inputs/outputs because the
    833 // implicitly calculated derivatives must be correct.
    834 //
    835 // TODO: move to SSA phase so we can easily determine whether bias is constant
    836 bool
    837 NV50LoweringPreSSA::handleTXB(TexInstruction *i)
    838 {
    839    const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
    840    int l, d;
    841 
    842    // We can't actually apply bias *and* do a compare for a cube
    843    // texture. Since the compare has to be done before the filtering, just
    844    // drop the bias on the floor.
    845    if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
    846       i->op = OP_TEX;
    847       i->setSrc(3, i->getSrc(4));
    848       i->setSrc(4, NULL);
    849       return handleTEX(i);
    850    }
    851 
    852    handleTEX(i);
    853    Value *bias = i->getSrc(i->tex.target.getArgCount());
    854    if (bias->isUniform())
    855       return true;
    856 
    857    Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
    858                                  bld.loadImm(NULL, 1));
    859    bld.setPosition(cond, false);
    860 
    861    for (l = 1; l < 4; ++l) {
    862       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
    863       Value *bit = bld.getSSA();
    864       Value *pred = bld.getScratch(1, FILE_FLAGS);
    865       Value *imm = bld.loadImm(NULL, (1 << l));
    866       bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
    867       bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
    868       cond->setSrc(l, bit);
    869    }
    870    Value *flags = bld.getScratch(1, FILE_FLAGS);
    871    bld.setPosition(cond, true);
    872    bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
    873 
    874    Instruction *tex[4];
    875    for (l = 0; l < 4; ++l) {
    876       (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
    877       bld.insert(tex[l]);
    878    }
    879 
    880    Value *res[4][4];
    881    for (d = 0; i->defExists(d); ++d)
    882       res[0][d] = tex[0]->getDef(d);
    883    for (l = 1; l < 4; ++l) {
    884       for (d = 0; tex[l]->defExists(d); ++d) {
    885          res[l][d] = cloneShallow(func, res[0][d]);
    886          bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
    887       }
    888    }
    889 
    890    for (d = 0; i->defExists(d); ++d) {
    891       Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
    892       for (l = 0; l < 4; ++l)
    893          dst->setSrc(l, res[l][d]);
    894    }
    895    delete_Instruction(prog, i);
    896    return true;
    897 }
    898 
    899 // LOD must be equal for all threads of a quad.
    900 // Unlike with TXB, here we can just diverge since there's no LOD calculation
    901 // that would require all 4 threads' sources to be set up properly.
    902 bool
    903 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
    904 {
    905    handleTEX(i);
    906    Value *lod = i->getSrc(i->tex.target.getArgCount());
    907    if (lod->isUniform())
    908       return true;
    909 
    910    BasicBlock *currBB = i->bb;
    911    BasicBlock *texiBB = i->bb->splitBefore(i, false);
    912    BasicBlock *joinBB = i->bb->splitAfter(i);
    913 
    914    bld.setPosition(currBB, true);
    915    assert(!currBB->joinAt);
    916    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
    917 
    918    for (int l = 0; l <= 3; ++l) {
    919       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
    920       Value *pred = bld.getScratch(1, FILE_FLAGS);
    921       bld.setPosition(currBB, true);
    922       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
    923       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
    924       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
    925       if (l <= 2) {
    926          BasicBlock *laneBB = new BasicBlock(func);
    927          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
    928          currBB = laneBB;
    929       }
    930    }
    931    bld.setPosition(joinBB, false);
    932    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
    933    return true;
    934 }
    935 
    936 bool
    937 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
    938 {
    939    static const uint8_t qOps[4][2] =
    940    {
    941       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
    942       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
    943       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
    944       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
    945    };
    946    Value *def[4][4];
    947    Value *crd[3];
    948    Instruction *tex;
    949    Value *zero = bld.loadImm(bld.getSSA(), 0);
    950    int l, c;
    951    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    952 
    953    handleTEX(i);
    954    i->op = OP_TEX; // no need to clone dPdx/dPdy later
    955    i->tex.derivAll = true;
    956 
    957    for (c = 0; c < dim; ++c)
    958       crd[c] = bld.getScratch();
    959 
    960    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
    961    for (l = 0; l < 4; ++l) {
    962       Value *src[3], *val;
    963       // mov coordinates from lane l to all lanes
    964       for (c = 0; c < dim; ++c)
    965          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
    966       // add dPdx from lane l to lanes dx
    967       for (c = 0; c < dim; ++c)
    968          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
    969       // add dPdy from lane l to lanes dy
    970       for (c = 0; c < dim; ++c)
    971          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
    972       // normalize cube coordinates if necessary
    973       if (i->tex.target.isCube()) {
    974          for (c = 0; c < 3; ++c)
    975             src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
    976          val = bld.getScratch();
    977          bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
    978          bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
    979          bld.mkOp1(OP_RCP, TYPE_F32, val, val);
    980          for (c = 0; c < 3; ++c)
    981             src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
    982       } else {
    983          for (c = 0; c < dim; ++c)
    984             src[c] = crd[c];
    985       }
    986       // texture
    987       bld.insert(tex = cloneForward(func, i));
    988       for (c = 0; c < dim; ++c)
    989          tex->setSrc(c, src[c]);
    990       // save results
    991       for (c = 0; i->defExists(c); ++c) {
    992          Instruction *mov;
    993          def[c][l] = bld.getSSA();
    994          mov = bld.mkMov(def[c][l], tex->getDef(c));
    995          mov->fixed = 1;
    996          mov->lanes = 1 << l;
    997       }
    998    }
    999    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
   1000 
   1001    for (c = 0; i->defExists(c); ++c) {
   1002       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
   1003       for (l = 0; l < 4; ++l)
   1004          u->setSrc(l, def[c][l]);
   1005    }
   1006 
   1007    i->bb->remove(i);
   1008    return true;
   1009 }
   1010 
   1011 bool
   1012 NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
   1013 {
   1014    handleTEX(i);
   1015    bld.setPosition(i, true);
   1016 
   1017    /* The returned values are not quite what we want:
   1018     * (a) convert from s32 to f32
   1019     * (b) multiply by 1/256
   1020     */
   1021    for (int def = 0; def < 2; ++def) {
   1022       if (!i->defExists(def))
   1023          continue;
   1024       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
   1025       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
   1026                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   1027    }
   1028    return true;
   1029 }
   1030 
   1031 bool
   1032 NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
   1033 {
   1034    Value *ms, *ms_x, *ms_y;
   1035    if (i->tex.query == TXQ_DIMS)
   1036       return true;
   1037    assert(i->tex.query == TXQ_TYPE);
   1038    assert(i->tex.mask == 4);
   1039 
   1040    loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
   1041    bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
   1042    i->bb->remove(i);
   1043 
   1044    return true;
   1045 }
   1046 
   1047 
   1048 bool
   1049 NV50LoweringPreSSA::handleSET(Instruction *i)
   1050 {
   1051    if (i->dType == TYPE_F32) {
   1052       bld.setPosition(i, true);
   1053       i->dType = TYPE_U32;
   1054       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
   1055       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
   1056    }
   1057    return true;
   1058 }
   1059 
   1060 bool
   1061 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
   1062 {
   1063    Value *src0 = bld.getSSA();
   1064    Value *src1 = bld.getSSA();
   1065    Value *pred = bld.getScratch(1, FILE_FLAGS);
   1066 
   1067    Value *v0 = i->getSrc(0);
   1068    Value *v1 = i->getSrc(1);
   1069    // XXX: these probably shouldn't be immediates in the first place ...
   1070    if (v0->asImm())
   1071       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   1072    if (v1->asImm())
   1073       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
   1074 
   1075    bld.setPosition(i, true);
   1076    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
   1077    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
   1078    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
   1079 
   1080    bld.setPosition(i, false);
   1081    i->op = OP_SET;
   1082    i->setFlagsDef(0, pred);
   1083    i->dType = TYPE_U8;
   1084    i->setSrc(0, i->getSrc(2));
   1085    i->setSrc(2, NULL);
   1086    i->setSrc(1, bld.loadImm(NULL, 0));
   1087 
   1088    return true;
   1089 }
   1090 
   1091 bool
   1092 NV50LoweringPreSSA::handleSELP(Instruction *i)
   1093 {
   1094    Value *src0 = bld.getSSA();
   1095    Value *src1 = bld.getSSA();
   1096 
   1097    Value *v0 = i->getSrc(0);
   1098    Value *v1 = i->getSrc(1);
   1099    if (v0->asImm())
   1100       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   1101    if (v1->asImm())
   1102       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
   1103 
   1104    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
   1105    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
   1106    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
   1107    delete_Instruction(prog, i);
   1108    return true;
   1109 }
   1110 
   1111 bool
   1112 NV50LoweringPreSSA::handleWRSV(Instruction *i)
   1113 {
   1114    Symbol *sym = i->getSrc(0)->asSym();
   1115 
   1116    // these are all shader outputs, $sreg are not writeable
   1117    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
   1118    if (addr >= 0x400)
   1119       return false;
   1120    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
   1121 
   1122    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
   1123 
   1124    bld.getBB()->remove(i);
   1125    return true;
   1126 }
   1127 
   1128 bool
   1129 NV50LoweringPreSSA::handleCALL(Instruction *i)
   1130 {
   1131    if (prog->getType() == Program::TYPE_COMPUTE) {
   1132       // Add implicit "thread id" argument in $r0 to the function
   1133       i->setSrc(i->srcCount(), tid);
   1134    }
   1135    return true;
   1136 }
   1137 
   1138 bool
   1139 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
   1140 {
   1141    delete_Instruction(prog, i);
   1142    return true;
   1143 }
   1144 
   1145 bool
   1146 NV50LoweringPreSSA::handleCONT(Instruction *i)
   1147 {
   1148    i->op = OP_BRA;
   1149    return true;
   1150 }
   1151 
   1152 bool
   1153 NV50LoweringPreSSA::handleRDSV(Instruction *i)
   1154 {
   1155    Symbol *sym = i->getSrc(0)->asSym();
   1156    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   1157    Value *def = i->getDef(0);
   1158    SVSemantic sv = sym->reg.data.sv.sv;
   1159    int idx = sym->reg.data.sv.index;
   1160 
   1161    if (addr >= 0x400) // mov $sreg
   1162       return true;
   1163 
   1164    switch (sv) {
   1165    case SV_POSITION:
   1166       assert(prog->getType() == Program::TYPE_FRAGMENT);
   1167       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
   1168       break;
   1169    case SV_FACE:
   1170       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
   1171       if (i->dType == TYPE_F32) {
   1172          bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
   1173          bld.mkOp1(OP_NEG, TYPE_S32, def, def);
   1174          bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
   1175       }
   1176       break;
   1177    case SV_NCTAID:
   1178    case SV_CTAID:
   1179    case SV_NTID:
   1180       if ((sv == SV_NCTAID && idx >= 2) ||
   1181           (sv == SV_NTID && idx >= 3)) {
   1182          bld.mkMov(def, bld.mkImm(1));
   1183       } else if (sv == SV_CTAID && idx >= 2) {
   1184          bld.mkMov(def, bld.mkImm(0));
   1185       } else {
   1186          Value *x = bld.getSSA(2);
   1187          bld.mkOp1(OP_LOAD, TYPE_U16, x,
   1188                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
   1189          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
   1190       }
   1191       break;
   1192    case SV_TID:
   1193       if (idx == 0) {
   1194          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
   1195       } else if (idx == 1) {
   1196          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
   1197          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
   1198       } else if (idx == 2) {
   1199          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
   1200       } else {
   1201          bld.mkMov(def, bld.mkImm(0));
   1202       }
   1203       break;
   1204    case SV_SAMPLE_POS: {
   1205       Value *off = new_LValue(func, FILE_ADDRESS);
   1206       bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
   1207       bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
   1208       bld.mkLoad(TYPE_F32,
   1209                  def,
   1210                  bld.mkSymbol(
   1211                        FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
   1212                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
   1213                  off);
   1214       break;
   1215    }
   1216    default:
   1217       bld.mkFetch(i->getDef(0), i->dType,
   1218                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
   1219       break;
   1220    }
   1221    bld.getBB()->remove(i);
   1222    return true;
   1223 }
   1224 
   1225 bool
   1226 NV50LoweringPreSSA::handleDIV(Instruction *i)
   1227 {
   1228    if (!isFloatType(i->dType))
   1229       return true;
   1230    bld.setPosition(i, false);
   1231    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
   1232    i->op = OP_MUL;
   1233    i->setSrc(1, rcp->getDef(0));
   1234    return true;
   1235 }
   1236 
   1237 bool
   1238 NV50LoweringPreSSA::handleSQRT(Instruction *i)
   1239 {
   1240    bld.setPosition(i, true);
   1241    i->op = OP_RSQ;
   1242    bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
   1243 
   1244    return true;
   1245 }
   1246 
   1247 bool
   1248 NV50LoweringPreSSA::handlePOW(Instruction *i)
   1249 {
   1250    LValue *val = bld.getScratch();
   1251 
   1252    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   1253    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   1254    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
   1255 
   1256    i->op = OP_EX2;
   1257    i->setSrc(0, val);
   1258    i->setSrc(1, NULL);
   1259 
   1260    return true;
   1261 }
   1262 
   1263 bool
   1264 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
   1265 {
   1266    if (prog->getType() == Program::TYPE_FRAGMENT) {
   1267       if (i->getIndirect(0, 0)) {
   1268          // TODO: redirect to l[] here, load to GPRs at exit
   1269          return false;
   1270       } else {
   1271          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
   1272 
   1273          i->op = OP_MOV;
   1274          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
   1275          i->src(0).set(i->src(1));
   1276          i->setSrc(1, NULL);
   1277          i->setDef(0, new_LValue(func, FILE_GPR));
   1278          i->getDef(0)->reg.data.id = id;
   1279 
   1280          prog->maxGPR = MAX2(prog->maxGPR, id * 2);
   1281       }
   1282    }
   1283    return true;
   1284 }
   1285 
   1286 // Handle indirect addressing in geometry shaders:
   1287 //
   1288 // ld $r0 a[$a1][$a2+k] ->
   1289 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
   1290 //
   1291 bool
   1292 NV50LoweringPreSSA::handleLOAD(Instruction *i)
   1293 {
   1294    ValueRef src = i->src(0);
   1295 
   1296    if (src.isIndirect(1)) {
   1297       assert(prog->getType() == Program::TYPE_GEOMETRY);
   1298       Value *addr = i->getIndirect(0, 1);
   1299 
   1300       if (src.isIndirect(0)) {
   1301          // base address is in an address register, so move to a GPR
   1302          Value *base = bld.getScratch();
   1303          bld.mkMov(base, addr);
   1304 
   1305          Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
   1306          Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
   1307          Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
   1308                                     i->getIndirect(0, 0), bld.mkImm(2));
   1309 
   1310          // Calculate final address: addr = base + attr*vstride; use 16-bit
   1311          // multiplication since 32-bit would be lowered to multiple
   1312          // instructions, and we only need the low 16 bits of the result
   1313          Value *a[2], *b[2];
   1314          bld.mkSplit(a, 2, attrib);
   1315          bld.mkSplit(b, 2, vstride);
   1316          Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
   1317                                  base);
   1318 
   1319          // move address from GPR into an address register
   1320          addr = bld.getSSA(2, FILE_ADDRESS);
   1321          bld.mkMov(addr, sum);
   1322       }
   1323 
   1324       i->setIndirect(0, 1, NULL);
   1325       i->setIndirect(0, 0, addr);
   1326    }
   1327 
   1328    return true;
   1329 }
   1330 
   1331 bool
   1332 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
   1333 {
   1334    assert(prog->getType() == Program::TYPE_GEOMETRY);
   1335 
   1336    // NOTE: cannot use getImmediate here, not in SSA form yet, move to
   1337    // later phase if that assertion ever triggers:
   1338 
   1339    ImmediateValue *imm = i->getSrc(0)->asImm();
   1340    assert(imm);
   1341 
   1342    assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
   1343 
   1344    if (i->srcExists(1)) {
   1345       // indirect addressing of vertex in primitive space
   1346 
   1347       LValue *val = bld.getScratch();
   1348       Value *ptr = bld.getSSA(2, FILE_ADDRESS);
   1349       bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
   1350       bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
   1351 
   1352       // NOTE: PFETCH directly to an $aX only works with direct addressing
   1353       i->op = OP_SHL;
   1354       i->setSrc(0, val);
   1355       i->setSrc(1, bld.mkImm(0));
   1356    }
   1357 
   1358    return true;
   1359 }
   1360 
   1361 // Set flags according to predicate and make the instruction read $cX.
   1362 void
   1363 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
   1364 {
   1365    Value *pred = insn->getPredicate();
   1366    Value *cdst;
   1367 
   1368    // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
   1369    if (!pred ||
   1370        pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
   1371       return;
   1372 
   1373    cdst = bld.getSSA(1, FILE_FLAGS);
   1374 
   1375    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
   1376 
   1377    insn->setPredicate(insn->cc, cdst);
   1378 }
   1379 
   1380 //
   1381 // - add quadop dance for texturing
   1382 // - put FP outputs in GPRs
   1383 // - convert instruction sequences
   1384 //
   1385 bool
   1386 NV50LoweringPreSSA::visit(Instruction *i)
   1387 {
   1388    bld.setPosition(i, false);
   1389 
   1390    if (i->cc != CC_ALWAYS)
   1391       checkPredicate(i);
   1392 
   1393    switch (i->op) {
   1394    case OP_TEX:
   1395    case OP_TXF:
   1396    case OP_TXG:
   1397       return handleTEX(i->asTex());
   1398    case OP_TXB:
   1399       return handleTXB(i->asTex());
   1400    case OP_TXL:
   1401       return handleTXL(i->asTex());
   1402    case OP_TXD:
   1403       return handleTXD(i->asTex());
   1404    case OP_TXLQ:
   1405       return handleTXLQ(i->asTex());
   1406    case OP_TXQ:
   1407       return handleTXQ(i->asTex());
   1408    case OP_EX2:
   1409       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
   1410       i->setSrc(0, i->getDef(0));
   1411       break;
   1412    case OP_SET:
   1413       return handleSET(i);
   1414    case OP_SLCT:
   1415       return handleSLCT(i->asCmp());
   1416    case OP_SELP:
   1417       return handleSELP(i);
   1418    case OP_POW:
   1419       return handlePOW(i);
   1420    case OP_DIV:
   1421       return handleDIV(i);
   1422    case OP_SQRT:
   1423       return handleSQRT(i);
   1424    case OP_EXPORT:
   1425       return handleEXPORT(i);
   1426    case OP_LOAD:
   1427       return handleLOAD(i);
   1428    case OP_RDSV:
   1429       return handleRDSV(i);
   1430    case OP_WRSV:
   1431       return handleWRSV(i);
   1432    case OP_CALL:
   1433       return handleCALL(i);
   1434    case OP_PRECONT:
   1435       return handlePRECONT(i);
   1436    case OP_CONT:
   1437       return handleCONT(i);
   1438    case OP_PFETCH:
   1439       return handlePFETCH(i);
   1440    default:
   1441       break;
   1442    }
   1443    return true;
   1444 }
   1445 
   1446 bool
   1447 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
   1448 {
   1449    bool ret = false;
   1450 
   1451    if (stage == CG_STAGE_PRE_SSA) {
   1452       NV50LoweringPreSSA pass(prog);
   1453       ret = pass.run(prog, false, true);
   1454    } else
   1455    if (stage == CG_STAGE_SSA) {
   1456       if (!prog->targetPriv)
   1457          prog->targetPriv = new std::list<Instruction *>();
   1458       NV50LegalizeSSA pass(prog);
   1459       ret = pass.run(prog, false, true);
   1460    } else
   1461    if (stage == CG_STAGE_POST_RA) {
   1462       NV50LegalizePostRA pass;
   1463       ret = pass.run(prog, false, true);
   1464       if (prog->targetPriv)
   1465          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   1466    }
   1467    return ret;
   1468 }
   1469 
   1470 } // namespace nv50_ir
   1471