Home | History | Annotate | Download | only in codegen
      1 /*
      2  * Copyright 2011 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
     18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
     19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     20  * SOFTWARE.
     21  */
     22 
     23 #include "nv50/codegen/nv50_ir.h"
     24 #include "nv50/codegen/nv50_ir_build_util.h"
     25 
     26 #include "nv50_ir_target_nv50.h"
     27 
     28 namespace nv50_ir {
     29 
// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
// -------------------
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
//       al*bl
//    ah*bl 00
//
// fffe0001 + fffe0001
//   (0xffff * 0xffff = 0xfffe0001, so the sum of the two cross products may
//    not fit into 32 bits; that overflow is carry1 above)
//
// Replaces @mul by a sequence of half-width multiplies / multiply-adds built
// with @bld. The result is written to the original def 0 of @mul and @mul
// itself is deleted.
// If mul->subOp is NV50_IR_SUBOP_MUL_HIGH the upper half of the product is
// computed, otherwise the lower half.
// Returns false (without touching anything) if mul->sType is not one of
// S32, U32, S64 or U64, and true otherwise.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;

   DataType fTy = mul->sType; // full type
   DataType hTy;              // half type: same signedness, half the width
   switch (fTy) {
   case TYPE_S32: hTy = TYPE_S16; break;
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   case TYPE_S64: hTy = TYPE_S32; break;
   default:
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   // Only i[2] .. i[5] are read again below; the other slots are assigned
   // but not used afterwards in this function.
   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *a[2], *b[2]; // halves of source 0 / source 1 as filled by mkSplit
   Value *c[2];        // carry flags, only set up for the high result
   Value *t[4];        // full-size temporaries
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   // split sources into halves
   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));

   // t[1] = a[0] * b[1] + a[1] * b[0]   (sum of the cross products)
   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
   // t[3] = a[0] * b[0] + (t[1] << half width)   (the low result)
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);

   if (highResult) {
      Value *r[3];
      // NOTE(review): for the 64 bit types halfSize is 4, which makes this
      // a shift of an int by 32 - confirm whether 64 bit multiplies are
      // expected to take the high-result path.
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 3; ++j)
         r[j] = bld->getSSA(fullSize);

      // r[0] = cross product sum >> half width; r[1] additionally gets
      // 1 << half width added, but only under the carry predicate set below
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);
   } else {
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   // the multiplies / multiply-adds take half-width sources
   // (i[5] only exists when the high result was requested)
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}
    106 
// Per-lane operation codes used to build a quad operation selector.
// Each code fits into 2 bits.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four QOP_* codes (given without the QOP_ prefix) into one byte,
// 2 bits per lane, with the first argument in the topmost bits:
//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
    116 
// Legalization pass for nv50.
// NOTE(review): going by the name this runs after register allocation; the
// pass ordering is not visible in this file section - confirm.
class NV50LegalizePostRA : public Pass
{
private:
   // per-function setup: creates r63 and patches the saved output writes
   virtual bool visit(Function *);
   // per-block cleanup: drops no-ops, emulates PRERET, replaces zero
   // immediates and splits 64 bit operations
   virtual bool visit(BasicBlock *);

   void handlePRERET(FlowInstruction *); // emulate PRERET via branch + call
   void replaceZero(Instruction *);      // zero immediate sources -> r63
   void split64BitOp(Instruction *);     // duplicate non-native F64 ops as U32

   LValue *r63; // GPR value with register id 63, set in visit(Function *)
};
    129 
    130 bool
    131 NV50LegalizePostRA::visit(Function *fn)
    132 {
    133    Program *prog = fn->getProgram();
    134 
    135    r63 = new_LValue(fn, FILE_GPR);
    136    r63->reg.data.id = 63;
    137 
    138    // this is actually per-program, but we can do it all on visiting main()
    139    std::list<Instruction *> *outWrites =
    140       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
    141 
    142    if (outWrites) {
    143       for (std::list<Instruction *>::iterator it = outWrites->begin();
    144            it != outWrites->end(); ++it)
    145          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
    146       // instructions will be deleted on exit
    147       outWrites->clear();
    148    }
    149 
    150    return true;
    151 }
    152 
    153 void
    154 NV50LegalizePostRA::replaceZero(Instruction *i)
    155 {
    156    for (int s = 0; i->srcExists(s); ++s) {
    157       ImmediateValue *imm = i->getSrc(s)->asImm();
    158       if (imm && imm->reg.data.u64 == 0)
    159          i->setSrc(s, r63);
    160    }
    161 }
    162 
    163 void
    164 NV50LegalizePostRA::split64BitOp(Instruction *i)
    165 {
    166    if (i->dType == TYPE_F64) {
    167       if (i->op == OP_MAD)
    168          i->op = OP_FMA;
    169       if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA ||
    170           i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX ||
    171           i->op == OP_SET)
    172          return;
    173       i->dType = i->sType = TYPE_U32;
    174 
    175       i->bb->insertAfter(i, cloneForward(func, i));
    176    }
    177 }
    178 
    179 // Emulate PRERET: jump to the target and call to the origin from there
    180 //
    181 // WARNING: atm only works if BBs are affected by at most a single PRERET
    182 //
    183 // BB:0
    184 // preret BB:3
    185 // (...)
    186 // BB:3
    187 // (...)
    188 //             --->
    189 // BB:0
    190 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
    191 // (...)
    192 // BB:3
    193 // bra BB:3 + n1 (skip the call)
    194 // call BB:0 + n2 (skip bra at beginning of BB:0)
    195 // (...)
    196 void
    197 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
    198 {
    199    BasicBlock *bbE = pre->bb;
    200    BasicBlock *bbT = pre->target.bb;
    201 
    202    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
    203    bbE->remove(pre);
    204    bbE->insertHead(pre);
    205 
    206    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
    207    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
    208 
    209    bbT->insertHead(call);
    210    bbT->insertHead(skip);
    211 
    212    // NOTE: maybe split blocks to prevent the instructions from moving ?
    213 
    214    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
    215    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
    216 }
    217 
    218 bool
    219 NV50LegalizePostRA::visit(BasicBlock *bb)
    220 {
    221    Instruction *i, *next;
    222 
    223    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
    224    for (i = bb->getFirst(); i; i = next) {
    225       next = i->next;
    226       if (i->isNop()) {
    227          bb->remove(i);
    228       } else
    229       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
    230          handlePRERET(i->asFlow());
    231       } else {
    232          if (i->op != OP_MOV && i->op != OP_PFETCH &&
    233              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
    234             replaceZero(i);
    235          if (typeSizeof(i->dType) == 8)
    236             split64BitOp(i);
    237       }
    238    }
    239    if (!bb->getEntry())
    240       return true;
    241 
    242    return true;
    243 }
    244 
// Legalization pass for nv50 that expands integer DIV / MOD / MUL / MAD,
// fixes up instructions defining address registers and, if enabled,
// collects output writes so they can be merged into the defining
// instruction later.
// NOTE(review): going by the name this operates on SSA form - confirm
// against the pass ordering, which is not visible here.
class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   void propagateWriteToOutput(Instruction *); // queue an EXPORT in outWrites
   void handleDIV(Instruction *);              // 32 bit integer division
   void handleMOD(Instruction *);              // 32 bit integer modulo
   void handleMUL(Instruction *);              // integer MUL / MAD expansion
   void handleAddrDef(Instruction *);          // defs in FILE_ADDRESS

   // true for SHL(GPR, immediate 0)
   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   // taken from prog->targetPriv in the constructor, or NULL if exports are
   // not to be propagated
   std::list<Instruction *> *outWrites;
};
    265 
    266 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
    267 {
    268    bld.setProgram(prog);
    269 
    270    if (prog->optLevel >= 2 &&
    271        (prog->getType() == Program::TYPE_GEOMETRY ||
    272         prog->getType() == Program::TYPE_VERTEX))
    273       outWrites =
    274          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
    275    else
    276       outWrites = NULL;
    277 }
    278 
    279 void
    280 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
    281 {
    282    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
    283       return;
    284 
    285    // check def instruction can store
    286    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
    287 
    288    // TODO: move exports (if beneficial) in common opt pass
    289    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
    290       return;
    291    for (int s = 0; di->srcExists(s); ++s)
    292       if (di->src(s).getFile() == FILE_IMMEDIATE)
    293          return;
    294 
    295    // We cannot set defs to non-lvalues before register allocation, so
    296    // save & remove (to save registers) the exports and replace later.
    297    outWrites->push_back(st);
    298    st->bb->remove(st);
    299 }
    300 
    301 bool
    302 NV50LegalizeSSA::isARL(const Instruction *i) const
    303 {
    304    ImmediateValue imm;
    305 
    306    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
    307       return false;
    308    if (!i->src(1).getImmediate(imm))
    309       return false;
    310    return imm.isInteger(0);
    311 }
    312 
    313 void
    314 NV50LegalizeSSA::handleAddrDef(Instruction *i)
    315 {
    316    Instruction *arl;
    317 
    318    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
    319 
    320    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
    321    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
    322       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
    323          return;
    324       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
    325          return;
    326    }
    327 
    328    // turn $a sources into $r sources (can't operate on $a)
    329    for (int s = 0; i->srcExists(s); ++s) {
    330       Value *a = i->getSrc(s);
    331       Value *r;
    332       if (a->reg.file == FILE_ADDRESS) {
    333          if (a->getInsn() && isARL(a->getInsn())) {
    334             i->setSrc(s, a->getInsn()->getSrc(0));
    335          } else {
    336             bld.setPosition(i, false);
    337             r = bld.getSSA();
    338             bld.mkMov(r, a);
    339             i->setSrc(s, r);
    340          }
    341       }
    342    }
    343    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
    344       return;
    345 
    346    // turn result back into $a
    347    bld.setPosition(i, true);
    348    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
    349    i->setDef(0, arl->getSrc(0));
    350 }
    351 
    352 void
    353 NV50LegalizeSSA::handleMUL(Instruction *mul)
    354 {
    355    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
    356       return;
    357    Value *def = mul->getDef(0);
    358    Value *pred = mul->getPredicate();
    359    CondCode cc = mul->cc;
    360    if (pred)
    361       mul->setPredicate(CC_ALWAYS, NULL);
    362 
    363    if (mul->op == OP_MAD) {
    364       Instruction *add = mul;
    365       bld.setPosition(add, false);
    366       Value *res = cloneShallow(func, mul->getDef(0));
    367       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
    368       add->op = OP_ADD;
    369       add->setSrc(0, mul->getDef(0));
    370       add->setSrc(1, add->getSrc(2));
    371       for (int s = 2; add->srcExists(s); ++s)
    372          add->setSrc(s, NULL);
    373       mul->subOp = add->subOp;
    374       add->subOp = 0;
    375    }
    376    expandIntegerMUL(&bld, mul);
    377    if (pred)
    378       def->getInsn()->setPredicate(cc, pred);
    379 }
    380 
// Use f32 division: first compute an approximate result, use it to reduce
// the dividend, which should then be representable as f32, divide the reduced
// dividend, and add the quotients.
//
// Only U32 and S32 divisions are handled, everything else is left alone.
// The new instructions are emitted in front of @div, and @div itself is
// turned into the final instruction of the sequence (a SUB for unsigned, a
// UNION for signed division), so its def stays the result.
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   // a, b: integer operands; af, bf: their f32 conversions
   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      // work on absolute values, the sign is fixed up at the end
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   // bf = 1 / b, then an integer (U32) add of -2 on the float's bits
   // NOTE(review): presumably this biases the reciprocal slightly low so
   // that the quotient estimates never overshoot - confirm.
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   // q0 = first quotient estimate, a * (1 / b) rounded towards zero
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   // note the naming: aRf is the integer remainder, aR its f32 conversion
   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   // qR = quotient estimate for the remainder
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   // (done by subtracting the SET result s from q; NOTE(review): this
   //  relies on an integer SET yielding -1 when true - confirm)
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
   if (!isSignedType(ty)) {
      // reuse div as the correcting subtraction
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      // the XOR of the original operands is only emitted for its flags
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      // reuse div to merge the two predicated results
      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}
    457 
    458 void
    459 NV50LegalizeSSA::handleMOD(Instruction *mod)
    460 {
    461    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
    462       return;
    463    bld.setPosition(mod, false);
    464 
    465    Value *q = bld.getSSA();
    466    Value *m = bld.getSSA();
    467 
    468    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
    469    handleDIV(q->getInsn());
    470 
    471    bld.setPosition(mod, false);
    472    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
    473 
    474    mod->op = OP_SUB;
    475    mod->setSrc(1, m);
    476 }
    477 
    478 bool
    479 NV50LegalizeSSA::visit(BasicBlock *bb)
    480 {
    481    Instruction *insn, *next;
    482    // skipping PHIs (don't pass them to handleAddrDef) !
    483    for (insn = bb->getEntry(); insn; insn = next) {
    484       next = insn->next;
    485 
    486       switch (insn->op) {
    487       case OP_EXPORT:
    488          if (outWrites)
    489             propagateWriteToOutput(insn);
    490          break;
    491       case OP_DIV:
    492          handleDIV(insn);
    493          break;
    494       case OP_MOD:
    495          handleMOD(insn);
    496          break;
    497       case OP_MAD:
    498       case OP_MUL:
    499          handleMUL(insn);
    500          break;
    501       default:
    502          break;
    503       }
    504 
    505       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
    506          handleAddrDef(insn);
    507    }
    508    return true;
    509 }
    510 
// Lowering pass for nv50 that rewrites individual instructions (system value
// access, texturing, selects, flow control, ...) into forms closer to what
// the target supports.
// NOTE(review): going by the name this runs before SSA construction - the
// pass ordering is not visible here, confirm.
class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   virtual bool visit(Function *);

   // system value reads / writes
   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handleEXPORT(Instruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   // comparisons and selects
   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   // texturing
   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3

   // flow control
   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);

private:
   const Target *const targ; // prog->getTarget(), set in the constructor

   BuildUtil bld;

   // copy of the implicit thread id argument; only set up for compute
   // programs in visit(Function *), NULL otherwise
   Value *tid;
};
    551 
    552 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
    553    targ(prog->getTarget()), tid(NULL)
    554 {
    555    bld.setProgram(prog);
    556 }
    557 
    558 bool
    559 NV50LoweringPreSSA::visit(Function *f)
    560 {
    561    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
    562 
    563    if (prog->getType() == Program::TYPE_COMPUTE) {
    564       // Add implicit "thread id" argument in $r0 to the function
    565       Value *arg = new_LValue(func, FILE_GPR);
    566       arg->reg.data.id = 0;
    567       f->ins.push_back(arg);
    568 
    569       bld.setPosition(root, false);
    570       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
    571    }
    572 
    573    return true;
    574 }
    575 
    576 // move array source to first slot, convert to u16, add indirections
    577 bool
    578 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
    579 {
    580    const int arg = i->tex.target.getArgCount();
    581    const int dref = arg;
    582    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
    583 
    584    // dref comes before bias/lod
    585    if (i->tex.target.isShadow())
    586       if (i->op == OP_TXB || i->op == OP_TXL)
    587          i->swapSources(dref, lod);
    588 
    589    // array index must be converted to u32
    590    if (i->tex.target.isArray()) {
    591       Value *layer = i->getSrc(arg - 1);
    592       LValue *src = new_LValue(func, FILE_GPR);
    593       bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
    594       bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
    595       i->setSrc(arg - 1, src);
    596 
    597       if (i->tex.target.isCube()) {
    598          // Value *face = layer;
    599          Value *x, *y;
    600          x = new_LValue(func, FILE_GPR);
    601          y = new_LValue(func, FILE_GPR);
    602          layer = new_LValue(func, FILE_GPR);
    603 
    604          i->tex.target = TEX_TARGET_2D_ARRAY;
    605 
    606          // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
    607          bld.mkMov(x, i->getSrc(0));
    608          bld.mkMov(y, i->getSrc(1));
    609          bld.mkMov(layer, i->getSrc(3));
    610 
    611          i->setSrc(0, x);
    612          i->setSrc(1, y);
    613          i->setSrc(2, layer);
    614          i->setSrc(3, i->getSrc(4));
    615          i->setSrc(4, NULL);
    616       }
    617    }
    618 
    619    // texel offsets are 3 immediate fields in the instruction,
    620    // nv50 cannot do textureGatherOffsets
    621    assert(i->tex.useOffsets <= 1);
    622 
    623    return true;
    624 }
    625 
// Bias must be equal for all threads of a quad or lod calculation will fail.
//
// The lanes of a quad are grouped by the bit in the condition register they
// have set, which is selected by differing bias values.
// Move the input values for TEX into a new register set for each group and
// execute TEX only for a specific group.
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
//
// Runs handleTEX() first. If the bias source is uniform nothing else is
// done; otherwise @i is replaced by 4 predicated clones whose results are
// merged, and @i is deleted. Always returns true.
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   // one condition code per lane group, tested against the flags built below
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true;

   // cond merges the constant 1 (source 0) with the per-lane bits that are
   // attached as sources 1..3 in the loop below
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      // quad operation on the bias against lane l, only its flags are used
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      // bit = 1 << l where the quad operation result compares equal
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   // turn the merged bits into a flags value to predicate on
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));

   // one clone of the texture instruction per group
   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   // res[l][d]: result d of group l; group 0 uses the clone's defs directly,
   // the others are copied under the group's predicate
   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   // merge the per-group results into the original defs
   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}
    688 
// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
//
// Runs handleTEX() first. If the lod source is uniform nothing else is done;
// otherwise the block is split around @i and a chain of conditional branches
// to the texture block is emitted, one per lane. Always returns true.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   if (lod->isUniform())
      return true;

   // currBB: code in front of the texture instruction
   // texiBB: block holding the texture instruction
   // joinBB: code following the texture instruction
   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      // quad operation on the lod against lane l, only its flags are used
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      // branch to the texture block where the result compares equal
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         // the test for the next lane goes into a new block of its own
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   // matching JOIN for the JOINAT emitted above
   bld.setPosition(joinBB, false);
   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
   return true;
}
    724 
// Lowers a texture fetch with explicit derivatives to 4 plain TEX
// instructions, one per lane of the quad. For lane l, the coordinates of that
// lane are broadcast to the whole quad and the lane's dPdx / dPdy are applied
// to them through quad operations; the fetch is done on the result and only
// lane l keeps what it returns. @i is removed from its block in the end.
// Always returns true.
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   // quad operation selectors per lane: [l][0] is used with dPdx, [l][1]
   // with dPdy
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // def[c][l]: result c as fetched for lane l
   Value *crd[3];    // scratch coordinates, reused for every lane
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   // note: read before handleTEX(), which may change the texture target
   const int dim = i->tex.target.getDim();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l; // the copy is restricted to lane l
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the 4 per-lane copies of each result into the original def
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
    783 
    784 bool
    785 NV50LoweringPreSSA::handleSET(Instruction *i)
    786 {
    787    if (i->dType == TYPE_F32) {
    788       bld.setPosition(i, true);
    789       i->dType = TYPE_U32;
    790       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
    791       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
    792    }
    793    return true;
    794 }
    795 
    796 bool
    797 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
    798 {
    799    Value *src0 = bld.getSSA();
    800    Value *src1 = bld.getSSA();
    801    Value *pred = bld.getScratch(1, FILE_FLAGS);
    802 
    803    Value *v0 = i->getSrc(0);
    804    Value *v1 = i->getSrc(1);
    805    // XXX: these probably shouldn't be immediates in the first place ...
    806    if (v0->asImm())
    807       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
    808    if (v1->asImm())
    809       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
    810 
    811    bld.setPosition(i, true);
    812    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
    813    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
    814    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
    815 
    816    bld.setPosition(i, false);
    817    i->op = OP_SET;
    818    i->setFlagsDef(0, pred);
    819    i->dType = TYPE_U8;
    820    i->setSrc(0, i->getSrc(2));
    821    i->setSrc(2, NULL);
    822    i->setSrc(1, bld.loadImm(NULL, 0));
    823 
    824    return true;
    825 }
    826 
    827 bool
    828 NV50LoweringPreSSA::handleSELP(Instruction *i)
    829 {
    830    Value *src0 = bld.getSSA();
    831    Value *src1 = bld.getSSA();
    832 
    833    Value *v0 = i->getSrc(0);
    834    Value *v1 = i->getSrc(1);
    835    if (v0->asImm())
    836       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
    837    if (v1->asImm())
    838       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
    839 
    840    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
    841    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
    842    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
    843    delete_Instruction(prog, i);
    844    return true;
    845 }
    846 
    847 bool
    848 NV50LoweringPreSSA::handleWRSV(Instruction *i)
    849 {
    850    Symbol *sym = i->getSrc(0)->asSym();
    851 
    852    // these are all shader outputs, $sreg are not writeable
    853    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
    854    if (addr >= 0x400)
    855       return false;
    856    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
    857 
    858    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
    859 
    860    bld.getBB()->remove(i);
    861    return true;
    862 }
    863 
    864 bool
    865 NV50LoweringPreSSA::handleCALL(Instruction *i)
    866 {
    867    if (prog->getType() == Program::TYPE_COMPUTE) {
    868       // Add implicit "thread id" argument in $r0 to the function
    869       i->setSrc(i->srcCount(), tid);
    870    }
    871    return true;
    872 }
    873 
    874 bool
    875 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
    876 {
    877    delete_Instruction(prog, i);
    878    return true;
    879 }
    880 
    881 bool
    882 NV50LoweringPreSSA::handleCONT(Instruction *i)
    883 {
    884    i->op = OP_BRA;
    885    return true;
    886 }
    887 
    888 bool
    889 NV50LoweringPreSSA::handleRDSV(Instruction *i)
    890 {
    891    Symbol *sym = i->getSrc(0)->asSym();
    892    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
    893    Value *def = i->getDef(0);
    894    SVSemantic sv = sym->reg.data.sv.sv;
    895    int idx = sym->reg.data.sv.index;
    896 
    897    if (addr >= 0x400) // mov $sreg
    898       return true;
    899 
    900    switch (sv) {
    901    case SV_POSITION:
    902       assert(prog->getType() == Program::TYPE_FRAGMENT);
    903       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
    904       break;
    905    case SV_FACE:
    906       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
    907       if (i->dType == TYPE_F32) {
    908          bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
    909          bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
    910       }
    911       break;
    912    case SV_NCTAID:
    913    case SV_CTAID:
    914    case SV_NTID:
    915       if ((sv == SV_NCTAID && idx >= 2) ||
    916           (sv == SV_NTID && idx >= 3)) {
    917          bld.mkMov(def, bld.mkImm(1));
    918       } else if (sv == SV_CTAID && idx >= 2) {
    919          bld.mkMov(def, bld.mkImm(0));
    920       } else {
    921          Value *x = bld.getSSA(2);
    922          bld.mkOp1(OP_LOAD, TYPE_U16, x,
    923                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
    924          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
    925       }
    926       break;
    927    case SV_TID:
    928       if (idx == 0) {
    929          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
    930       } else if (idx == 1) {
    931          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
    932          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
    933       } else if (idx == 2) {
    934          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
    935       } else {
    936          bld.mkMov(def, bld.mkImm(0));
    937       }
    938       break;
    939    default:
    940       bld.mkFetch(i->getDef(0), i->dType,
    941                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
    942       break;
    943    }
    944    bld.getBB()->remove(i);
    945    return true;
    946 }
    947 
    948 bool
    949 NV50LoweringPreSSA::handleDIV(Instruction *i)
    950 {
    951    if (!isFloatType(i->dType))
    952       return true;
    953    bld.setPosition(i, false);
    954    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
    955    i->op = OP_MUL;
    956    i->setSrc(1, rcp->getDef(0));
    957    return true;
    958 }
    959 
    960 bool
    961 NV50LoweringPreSSA::handleSQRT(Instruction *i)
    962 {
    963    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
    964                                 bld.getSSA(), i->getSrc(0));
    965    i->op = OP_MUL;
    966    i->setSrc(1, rsq->getDef(0));
    967 
    968    return true;
    969 }
    970 
    971 bool
    972 NV50LoweringPreSSA::handlePOW(Instruction *i)
    973 {
    974    LValue *val = bld.getScratch();
    975 
    976    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
    977    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
    978    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
    979 
    980    i->op = OP_EX2;
    981    i->setSrc(0, val);
    982    i->setSrc(1, NULL);
    983 
    984    return true;
    985 }
    986 
    987 bool
    988 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
    989 {
    990    if (prog->getType() == Program::TYPE_FRAGMENT) {
    991       if (i->getIndirect(0, 0)) {
    992          // TODO: redirect to l[] here, load to GPRs at exit
    993          return false;
    994       } else {
    995          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
    996 
    997          i->op = OP_MOV;
    998          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
    999          i->src(0).set(i->src(1));
   1000          i->setSrc(1, NULL);
   1001          i->setDef(0, new_LValue(func, FILE_GPR));
   1002          i->getDef(0)->reg.data.id = id;
   1003 
   1004          prog->maxGPR = MAX2(prog->maxGPR, id);
   1005       }
   1006    }
   1007    return true;
   1008 }
   1009 
   1010 // Set flags according to predicate and make the instruction read $cX.
   1011 void
   1012 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
   1013 {
   1014    Value *pred = insn->getPredicate();
   1015    Value *cdst;
   1016 
   1017    if (!pred || pred->reg.file == FILE_FLAGS)
   1018       return;
   1019    cdst = bld.getSSA(1, FILE_FLAGS);
   1020 
   1021    bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred);
   1022 
   1023    insn->setPredicate(insn->cc, cdst);
   1024 }
   1025 
   1026 //
   1027 // - add quadop dance for texturing
   1028 // - put FP outputs in GPRs
   1029 // - convert instruction sequences
   1030 //
   1031 bool
   1032 NV50LoweringPreSSA::visit(Instruction *i)
   1033 {
   1034    bld.setPosition(i, false);
   1035 
   1036    if (i->cc != CC_ALWAYS)
   1037       checkPredicate(i);
   1038 
   1039    switch (i->op) {
   1040    case OP_TEX:
   1041    case OP_TXF:
   1042    case OP_TXG:
   1043       return handleTEX(i->asTex());
   1044    case OP_TXB:
   1045       return handleTXB(i->asTex());
   1046    case OP_TXL:
   1047       return handleTXL(i->asTex());
   1048    case OP_TXD:
   1049       return handleTXD(i->asTex());
   1050    case OP_EX2:
   1051       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
   1052       i->setSrc(0, i->getDef(0));
   1053       break;
   1054    case OP_SET:
   1055       return handleSET(i);
   1056    case OP_SLCT:
   1057       return handleSLCT(i->asCmp());
   1058    case OP_SELP:
   1059       return handleSELP(i);
   1060    case OP_POW:
   1061       return handlePOW(i);
   1062    case OP_DIV:
   1063       return handleDIV(i);
   1064    case OP_SQRT:
   1065       return handleSQRT(i);
   1066    case OP_EXPORT:
   1067       return handleEXPORT(i);
   1068    case OP_RDSV:
   1069       return handleRDSV(i);
   1070    case OP_WRSV:
   1071       return handleWRSV(i);
   1072    case OP_CALL:
   1073       return handleCALL(i);
   1074    case OP_PRECONT:
   1075       return handlePRECONT(i);
   1076    case OP_CONT:
   1077       return handleCONT(i);
   1078    default:
   1079       break;
   1080    }
   1081    return true;
   1082 }
   1083 
   1084 bool
   1085 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
   1086 {
   1087    bool ret = false;
   1088 
   1089    if (stage == CG_STAGE_PRE_SSA) {
   1090       NV50LoweringPreSSA pass(prog);
   1091       ret = pass.run(prog, false, true);
   1092    } else
   1093    if (stage == CG_STAGE_SSA) {
   1094       if (!prog->targetPriv)
   1095          prog->targetPriv = new std::list<Instruction *>();
   1096       NV50LegalizeSSA pass(prog);
   1097       ret = pass.run(prog, false, true);
   1098    } else
   1099    if (stage == CG_STAGE_POST_RA) {
   1100       NV50LegalizePostRA pass;
   1101       ret = pass.run(prog, false, true);
   1102       if (prog->targetPriv)
   1103          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   1104    }
   1105    return ret;
   1106 }
   1107 
   1108 } // namespace nv50_ir
   1109