Home | History | Annotate | Download | only in codegen
      1 /*
      2  * Copyright 2011 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "codegen/nv50_ir.h"
     24 #include "codegen/nv50_ir_target.h"
     25 #include "codegen/nv50_ir_build_util.h"
     26 
     27 extern "C" {
     28 #include "util/u_math.h"
     29 }
     30 
     31 namespace nv50_ir {
     32 
// Return true if this instruction has no observable effect and may be
// skipped (pseudo ops, unfixed NOPs, unused results, and self-moves).
bool
Instruction::isNop() const
{
   // Pseudo instructions that only carry bookkeeping information.
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (op == OP_ATOM)
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   // A result that never got a register assigned is unused, so the
   // instruction computing it is a nop.
   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   // A MOV (or UNION) whose destination equals its source(s) does nothing.
   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}
     63 
     64 bool Instruction::isDead() const
     65 {
     66    if (op == OP_STORE ||
     67        op == OP_EXPORT ||
     68        op == OP_ATOM ||
     69        op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
     70        op == OP_WRSV)
     71       return false;
     72 
     73    for (int d = 0; defExists(d); ++d)
     74       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
     75          return false;
     76 
     77    if (terminator || asFlow())
     78       return false;
     79    if (fixed)
     80       return false;
     81 
     82    return true;
     83 };
     84 
     85 // =============================================================================
     86 
// Pass that forward-propagates register-to-register MOVs so that later
// optimizations see the original value producers directly.
class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
     92 
     93 // Propagate all MOVs forward to make subsequent optimization easier, except if
     94 // the sources stem from a phi, in which case we don't want to mess up potential
     95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
     96 bool
     97 CopyPropagation::visit(BasicBlock *bb)
     98 {
     99    Instruction *mov, *si, *next;
    100 
    101    for (mov = bb->getEntry(); mov; mov = next) {
    102       next = mov->next;
    103       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
    104          continue;
    105       if (mov->getPredicate())
    106          continue;
    107       if (mov->def(0).getFile() != mov->src(0).getFile())
    108          continue;
    109       si = mov->getSrc(0)->getInsn();
    110       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
    111          // propagate
    112          mov->def(0).replace(mov->getSrc(0), false);
    113          delete_Instruction(prog, mov);
    114       }
    115    }
    116    return true;
    117 }
    118 
    119 // =============================================================================
    120 
// Pass that eliminates redundant SPLIT / MERGE round-trips of 64-bit values
// (see MergeSplits::visit below).
class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
    126 
    127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
    128 // post-merge def with the SPLIT's source.
    129 bool
    130 MergeSplits::visit(BasicBlock *bb)
    131 {
    132    Instruction *i, *next, *si;
    133 
    134    for (i = bb->getEntry(); i; i = next) {
    135       next = i->next;
    136       if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
    137          continue;
    138       si = i->getSrc(0)->getInsn();
    139       if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
    140          continue;
    141       i->def(0).replace(si->getSrc(0), false);
    142       delete_Instruction(prog, i);
    143    }
    144 
    145    return true;
    146 }
    147 
    148 // =============================================================================
    149 
// Pass that folds loads (constant buffer, immediates, attributes, shared
// memory) directly into the consuming instructions where the target can
// encode them, swapping commutative sources when that enables a fold.
class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // Swap src(0)/src(1) if that lets the load/immediate land in slot 1.
   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};
    161 
    162 bool
    163 LoadPropagation::isCSpaceLoad(Instruction *ld)
    164 {
    165    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
    166 }
    167 
    168 bool
    169 LoadPropagation::isImmdLoad(Instruction *ld)
    170 {
    171    if (!ld || (ld->op != OP_MOV) ||
    172        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
    173       return false;
    174 
    175    // A 0 can be replaced with a register, so it doesn't count as an immediate.
    176    ImmediateValue val;
    177    return ld->src(0).getImmediate(val) && !val.isInteger(0);
    178 }
    179 
    180 bool
    181 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
    182 {
    183    return ld &&
    184       (ld->op == OP_VFETCH ||
    185        (ld->op == OP_LOAD &&
    186         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
    187          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
    188 }
    189 
// Swap src(0) and src(1) when that moves a foldable load/immediate into
// slot 1 (the slot targets can typically encode), compensating SET/SLCT
// condition codes and SUB sign modifiers so semantics are preserved.
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   const Target *targ = prog->getTarget();
   // Only commutative ops, or ops we know how to compensate (SET/SLCT/SUB).
   if (!targ->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
         return;
   // src(1) must currently be a GPR for a swap to gain anything.
   if (insn->src(1).getFile() != FILE_GPR)
      return;
   // This is the special OP_SET used for alphatesting, we can't reverse its
   // arguments as that will confuse the fixup code.
   if (insn->op == OP_SET && insn->subOp)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Swap sources to inline the less frequently used source. That way,
   // optimistically, it will eventually be able to remove the instruction.
   int i0refs = insn->getSrc(0)->refCount();
   int i1refs = insn->getSrc(1)->refCount();

   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
      // Swap only if src(1) is not an equally good (or better) candidate.
      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
          !targ->insnCanLoad(insn, 1, i1) ||
          i0refs < i1refs)
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   // A swap happened: fix up the operation so the result is unchanged.
   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SUB) {
      // a - b == -(b - a): negate both source modifiers.
      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
   }
}
    241 
    242 bool
    243 LoadPropagation::visit(BasicBlock *bb)
    244 {
    245    const Target *targ = prog->getTarget();
    246    Instruction *next;
    247 
    248    for (Instruction *i = bb->getEntry(); i; i = next) {
    249       next = i->next;
    250 
    251       if (i->op == OP_CALL) // calls have args as sources, they must be in regs
    252          continue;
    253 
    254       if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
    255          continue;
    256 
    257       if (i->srcExists(1))
    258          checkSwapSrc01(i);
    259 
    260       for (int s = 0; i->srcExists(s); ++s) {
    261          Instruction *ld = i->getSrc(s)->getInsn();
    262 
    263          if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
    264             continue;
    265          if (!targ->insnCanLoad(i, s, ld))
    266             continue;
    267 
    268          // propagate !
    269          i->setSrc(s, ld->getSrc(0));
    270          if (ld->src(0).isIndirect(0))
    271             i->setIndirect(s, 0, ld->getIndirect(0, 0));
    272 
    273          if (ld->getDef(0)->refCount() == 0)
    274             delete_Instruction(prog, ld);
    275       }
    276    }
    277    return true;
    278 }
    279 
    280 // =============================================================================
    281 
// Pass that folds constant adjustments of indirect addresses (ADD/SUB with
// an immediate, or a plain immediate MOV) into the instruction's offset.
class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
    287 
// For each indirect source, look at the instruction computing the address:
// - ADD $a, imm  -> use $a as the indirect and add imm to the offset
// - SUB $a, imm  -> use $a as the indirect and subtract imm from the offset
// - MOV imm      -> drop the indirection entirely, add imm to the offset
// in each case only when the target can encode the resulting offset.
bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            // Clone the symbol before adjusting its offset so other users of
            // the original value are unaffected.
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            // The whole address is constant: no indirection needed at all.
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}
    333 
    334 // =============================================================================
    335 
    336 // Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   // Run the pass repeatedly until nothing folds (at most two iterations).
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   // Fold instructions with 2 resp. 3 immediate sources.
   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
   // Simplify an instruction with an immediate in source slot s / slot 2.
   void opnd(Instruction *, ImmediateValue&, int s);
   void opnd3(Instruction *, ImmediateValue&);

   // Fold a unary f32 op applied to an immediate.
   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   // Trace a value back to the compare instruction it originated from.
   CmpInstruction *findOriginForTestWithZero(Value *);

   unsigned int foldCount;   // number of folds performed in the last run

   BuildUtil bld;
};
    360 
    361 // TODO: remember generated immediates and only revisit these
    362 bool
    363 ConstantFolding::foldAll(Program *prog)
    364 {
    365    unsigned int iterCount = 0;
    366    do {
    367       foldCount = 0;
    368       if (!run(prog))
    369          return false;
    370    } while (foldCount && ++iterCount < 2);
    371    return true;
    372 }
    373 
// Attempt to fold each instruction in the block, preferring the richest
// fold available: all three sources immediate, then both of two, then a
// single immediate operand (slot 0 before slot 1, then slot 2).
bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      // MOVs of immediates are already folded; CALL args must stay put.
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2))
         expr(i, src0, src1, src2);
      else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
      // NOTE(review): assumes expr()/opnd() rewrite i in place and never
      // delete it, since i is used again here — confirm against callees.
      if (i->srcExists(2) && i->src(2).getImmediate(src2))
         opnd3(i, src2);
   }
   return true;
}
    406 
    407 CmpInstruction *
    408 ConstantFolding::findOriginForTestWithZero(Value *value)
    409 {
    410    if (!value)
    411       return NULL;
    412    Instruction *insn = value->getInsn();
    413    if (!insn)
    414       return NULL;
    415 
    416    if (insn->asCmp() && insn->op != OP_SLCT)
    417       return insn->asCmp();
    418 
    419    /* Sometimes mov's will sneak in as a result of other folding. This gets
    420     * cleaned up later.
    421     */
    422    if (insn->op == OP_MOV)
    423       return findOriginForTestWithZero(insn->getSrc(0));
    424 
    425    /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
    426    if (insn->op == OP_AND) {
    427       int s = 0;
    428       ImmediateValue imm;
    429       if (!insn->src(s).getImmediate(imm)) {
    430          s = 1;
    431          if (!insn->src(s).getImmediate(imm))
    432             return NULL;
    433       }
    434       if (imm.reg.data.f32 != 1.0f)
    435          return NULL;
    436       /* TODO: Come up with a way to handle the condition being inverted */
    437       if (insn->src(!s).mod != Modifier(0))
    438          return NULL;
    439       return findOriginForTestWithZero(insn->getSrc(!s));
    440    }
    441 
    442    return NULL;
    443 }
    444 
// Apply this modifier's ABS/NEG/SAT/NOT bits to an immediate value in
// place, interpreting it according to imm.reg.type.
void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         // Clamp to [0, 1].
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      // NOT is meaningless on floats.
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}
    502 
    503 operation
    504 Modifier::getOp() const
    505 {
    506    switch (bits) {
    507    case NV50_IR_MOD_ABS: return OP_ABS;
    508    case NV50_IR_MOD_NEG: return OP_NEG;
    509    case NV50_IR_MOD_SAT: return OP_SAT;
    510    case NV50_IR_MOD_NOT: return OP_NOT;
    511    case 0:
    512       return OP_MOV;
    513    default:
    514       return OP_CVT;
    515    }
    516 }
    517 
// Fold a binary instruction whose two sources are both immediates into a
// MOV/SAT of the computed constant. MAD/FMA (where the folded immediates
// were src0*src1) are instead rewritten as an ADD of src2 and the product.
void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      // With the dnz flag, non-finite f32 operands are treated as zero.
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         // MUL_HIGH takes the upper 32 bits of the 64-bit product.
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      // Leave division by zero alone (res stays 0).
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SUB:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      // Only foldable when both selectable values are identical.
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   case OP_EXTBF: {
      // src1 encodes the bitfield as (width << 8) | offset.
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      if (width + offset < 32) {
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      // Shift left then (sign-)shift right to extract and extend the field.
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      type = TYPE_U32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   // Replace both sources with the folded immediate.
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();

      // Move the immediate into position 1, where we know it might be
      // emittable. However it might not be anyways, as there may be other
      // restrictions, so move it into a separate LValue.
      bld.setPosition(i, false);
      i->op = OP_ADD;
      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      // Recurse: the remaining ADD may itself be foldable.
      if (i->src(0).getImmediate(src0))
         expr(i, src0, src1);
      else
         opnd(i, src1, 1);
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV;
      if (i->saturate)
         unary(i, *i->getSrc(0)->asImm());
      break;
   }
   i->subOp = 0;
}
    737 
    738 void
    739 ConstantFolding::expr(Instruction *i,
    740                       ImmediateValue &imm0,
    741                       ImmediateValue &imm1,
    742                       ImmediateValue &imm2)
    743 {
    744    struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
    745    struct Storage res;
    746 
    747    memset(&res.data, 0, sizeof(res.data));
    748 
    749    switch (i->op) {
    750    case OP_INSBF: {
    751       int offset = b->data.u32 & 0xff;
    752       int width = (b->data.u32 >> 8) & 0xff;
    753       unsigned bitmask = ((1 << width) - 1) << offset;
    754       res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
    755       break;
    756    }
    757    case OP_MAD:
    758    case OP_FMA: {
    759       switch (i->dType) {
    760       case TYPE_F32:
    761          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
    762             c->data.f32;
    763          break;
    764       case TYPE_F64:
    765          res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
    766          break;
    767       case TYPE_S32:
    768          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
    769             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
    770             break;
    771          }
    772          /* fallthrough */
    773       case TYPE_U32:
    774          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
    775             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
    776             break;
    777          }
    778          res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
    779          break;
    780       default:
    781          return;
    782       }
    783       break;
    784    }
    785    case OP_SHLADD:
    786       res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
    787       break;
    788    default:
    789       return;
    790    }
    791 
    792    ++foldCount;
    793    i->src(0).mod = Modifier(0);
    794    i->src(1).mod = Modifier(0);
    795    i->src(2).mod = Modifier(0);
    796 
    797    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
    798    i->setSrc(1, NULL);
    799    i->setSrc(2, NULL);
    800 
    801    i->getSrc(0)->reg.data = res.data;
    802    i->getSrc(0)->reg.type = i->dType;
    803    i->getSrc(0)->reg.size = typeSizeof(i->dType);
    804 
    805    i->op = OP_MOV;
    806 }
    807 
    808 void
    809 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
    810 {
    811    Storage res;
    812 
    813    if (i->dType != TYPE_F32)
    814       return;
    815    switch (i->op) {
    816    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
    817    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
    818    case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
    819    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
    820    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
    821    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
    822    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
    823    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
    824    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
    825    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
    826    case OP_PRESIN:
    827    case OP_PREEX2:
    828       // these should be handled in subsequent OP_SIN/COS/EX2
    829       res.data.f32 = imm.reg.data.f32;
    830       break;
    831    default:
    832       return;
    833    }
    834    i->op = OP_MOV;
    835    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
    836    i->src(0).mod = Modifier(0);
    837 }
    838 
// Collapse a chain of two f32 multiplies when one factor of 'mul2' is the
// immediate 'imm2' (at source index 's').  Two directions are tried:
//  1) mul1 = mul r, imm1; mul2 = mul mul1, imm2  ->  mul1 = mul r, imm1*imm2
//  2) encode the immediate as a power-of-two postFactor on the other MUL,
//     if the target supports post-multiplication.
// 'f' already folds mul2's own postFactor into the immediate value.
void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1; // index of the non-immediate source of mul2
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0; // postFactor exponent, filled in by isPostMultiplySupported()
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   // Case 1: mul2's other operand is itself a single-use f32 MUL.
   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      // NOTE(review): insn is dereferenced without a NULL check here, unlike
      // the second case below; presumably a single-use temporary always has a
      // defining instruction — confirm.
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm   -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            // A negative factor cannot be expressed by the postFactor alone;
            // fold the sign into a source modifier instead.
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   // Case 2: mul2's single user is another f32 MUL; push the immediate
   // forward into that instruction as a postFactor.
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c   -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      // Only safe if the user's other source is not an immediate (that case
      // is handled by regular constant folding) and our operand has no mod.
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t)); // bypass mul1, use its plain operand
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}
    903 
    904 void
    905 ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
    906 {
    907    switch (i->op) {
    908    case OP_MAD:
    909    case OP_FMA:
    910       if (imm2.isInteger(0)) {
    911          i->op = OP_MUL;
    912          i->setSrc(2, NULL);
    913          foldCount++;
    914          return;
    915       }
    916       break;
    917    case OP_SHLADD:
    918       if (imm2.isInteger(0)) {
    919          i->op = OP_SHL;
    920          i->setSrc(2, NULL);
    921          foldCount++;
    922          return;
    923       }
    924       break;
    925    default:
    926       return;
    927    }
    928 }
    929 
    930 void
    931 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
    932 {
    933    const Target *target = prog->getTarget();
    934    const int t = !s;
    935    const operation op = i->op;
    936    Instruction *newi = i;
    937 
    938    switch (i->op) {
    939    case OP_SPLIT: {
    940       bld.setPosition(i, false);
    941 
    942       uint8_t size = i->getDef(0)->reg.size;
    943       uint8_t bitsize = size * 8;
    944       uint32_t mask = (1ULL << bitsize) - 1;
    945       assert(bitsize <= 32);
    946 
    947       uint64_t val = imm0.reg.data.u64;
    948       for (int8_t d = 0; i->defExists(d); ++d) {
    949          Value *def = i->getDef(d);
    950          assert(def->reg.size == size);
    951 
    952          newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
    953          val >>= bitsize;
    954       }
    955       delete_Instruction(prog, i);
    956       break;
    957    }
    958    case OP_MUL:
    959       if (i->dType == TYPE_F32)
    960          tryCollapseChainedMULs(i, s, imm0);
    961 
    962       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
    963          assert(!isFloatType(i->sType));
    964          if (imm0.isInteger(1) && i->dType == TYPE_S32) {
    965             bld.setPosition(i, false);
    966             // Need to set to the sign value, which is a compare.
    967             newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
    968                              TYPE_S32, i->getSrc(t), bld.mkImm(0));
    969             delete_Instruction(prog, i);
    970          } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
    971             // The high bits can't be set in this case (either mul by 0 or
    972             // unsigned by 1)
    973             i->op = OP_MOV;
    974             i->subOp = 0;
    975             i->setSrc(0, new_ImmediateValue(prog, 0u));
    976             i->src(0).mod = Modifier(0);
    977             i->setSrc(1, NULL);
    978          } else if (!imm0.isNegative() && imm0.isPow2()) {
    979             // Translate into a shift
    980             imm0.applyLog2();
    981             i->op = OP_SHR;
    982             i->subOp = 0;
    983             imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
    984             i->setSrc(0, i->getSrc(t));
    985             i->src(0).mod = i->src(t).mod;
    986             i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
    987             i->src(1).mod = 0;
    988          }
    989       } else
    990       if (imm0.isInteger(0)) {
    991          i->op = OP_MOV;
    992          i->setSrc(0, new_ImmediateValue(prog, 0u));
    993          i->src(0).mod = Modifier(0);
    994          i->postFactor = 0;
    995          i->setSrc(1, NULL);
    996       } else
    997       if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
    998          if (imm0.isNegative())
    999             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
   1000          i->op = i->src(t).mod.getOp();
   1001          if (s == 0) {
   1002             i->setSrc(0, i->getSrc(1));
   1003             i->src(0).mod = i->src(1).mod;
   1004             i->src(1).mod = 0;
   1005          }
   1006          if (i->op != OP_CVT)
   1007             i->src(0).mod = 0;
   1008          i->setSrc(1, NULL);
   1009       } else
   1010       if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
   1011          if (imm0.isNegative())
   1012             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
   1013          i->op = OP_ADD;
   1014          i->setSrc(s, i->getSrc(t));
   1015          i->src(s).mod = i->src(t).mod;
   1016       } else
   1017       if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
   1018          i->op = OP_SHL;
   1019          imm0.applyLog2();
   1020          i->setSrc(0, i->getSrc(t));
   1021          i->src(0).mod = i->src(t).mod;
   1022          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
   1023          i->src(1).mod = 0;
   1024       } else
   1025       if (i->postFactor && i->sType == TYPE_F32) {
   1026          /* Can't emit a postfactor with an immediate, have to fold it in */
   1027          i->setSrc(s, new_ImmediateValue(
   1028                       prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
   1029          i->postFactor = 0;
   1030       }
   1031       break;
   1032    case OP_FMA:
   1033    case OP_MAD:
   1034       if (imm0.isInteger(0)) {
   1035          i->setSrc(0, i->getSrc(2));
   1036          i->src(0).mod = i->src(2).mod;
   1037          i->setSrc(1, NULL);
   1038          i->setSrc(2, NULL);
   1039          i->op = i->src(0).mod.getOp();
   1040          if (i->op != OP_CVT)
   1041             i->src(0).mod = 0;
   1042       } else
   1043       if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
   1044           (imm0.isInteger(1) || imm0.isInteger(-1))) {
   1045          if (imm0.isNegative())
   1046             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
   1047          if (s == 0) {
   1048             i->setSrc(0, i->getSrc(1));
   1049             i->src(0).mod = i->src(1).mod;
   1050          }
   1051          i->setSrc(1, i->getSrc(2));
   1052          i->src(1).mod = i->src(2).mod;
   1053          i->setSrc(2, NULL);
   1054          i->op = OP_ADD;
   1055       } else
   1056       if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
   1057           !isFloatType(i->dType) &&
   1058           target->isOpSupported(OP_SHLADD, i->dType)) {
   1059          i->op = OP_SHLADD;
   1060          imm0.applyLog2();
   1061          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
   1062       }
   1063       break;
   1064    case OP_SUB:
   1065       if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
   1066           !isFloatType(i->dType))
   1067          break;
   1068       /* fallthrough */
   1069    case OP_ADD:
   1070       if (i->usesFlags())
   1071          break;
   1072       if (imm0.isInteger(0)) {
   1073          if (s == 0) {
   1074             i->setSrc(0, i->getSrc(1));
   1075             i->src(0).mod = i->src(1).mod;
   1076             if (i->op == OP_SUB)
   1077                i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
   1078          }
   1079          i->setSrc(1, NULL);
   1080          i->op = i->src(0).mod.getOp();
   1081          if (i->op != OP_CVT)
   1082             i->src(0).mod = Modifier(0);
   1083       }
   1084       break;
   1085 
   1086    case OP_DIV:
   1087       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
   1088          break;
   1089       bld.setPosition(i, false);
   1090       if (imm0.reg.data.u32 == 0) {
   1091          break;
   1092       } else
   1093       if (imm0.reg.data.u32 == 1) {
   1094          i->op = OP_MOV;
   1095          i->setSrc(1, NULL);
   1096       } else
   1097       if (i->dType == TYPE_U32 && imm0.isPow2()) {
   1098          i->op = OP_SHR;
   1099          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
   1100       } else
   1101       if (i->dType == TYPE_U32) {
   1102          Instruction *mul;
   1103          Value *tA, *tB;
   1104          const uint32_t d = imm0.reg.data.u32;
   1105          uint32_t m;
   1106          int r, s;
   1107          uint32_t l = util_logbase2(d);
   1108          if (((uint32_t)1 << l) < d)
   1109             ++l;
   1110          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
   1111          r = l ? 1 : 0;
   1112          s = l ? (l - 1) : 0;
   1113 
   1114          tA = bld.getSSA();
   1115          tB = bld.getSSA();
   1116          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
   1117                          bld.loadImm(NULL, m));
   1118          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
   1119          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
   1120          tA = bld.getSSA();
   1121          if (r)
   1122             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
   1123          else
   1124             tA = tB;
   1125          tB = s ? bld.getSSA() : i->getDef(0);
   1126          newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
   1127          if (s)
   1128             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
   1129 
   1130          delete_Instruction(prog, i);
   1131       } else
   1132       if (imm0.reg.data.s32 == -1) {
   1133          i->op = OP_NEG;
   1134          i->setSrc(1, NULL);
   1135       } else {
   1136          LValue *tA, *tB;
   1137          LValue *tD;
   1138          const int32_t d = imm0.reg.data.s32;
   1139          int32_t m;
   1140          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
   1141          if ((1 << l) < abs(d))
   1142             ++l;
   1143          if (!l)
   1144             l = 1;
   1145          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
   1146 
   1147          tA = bld.getSSA();
   1148          tB = bld.getSSA();
   1149          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
   1150                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
   1151          if (l > 1)
   1152             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
   1153          else
   1154             tB = tA;
   1155          tA = bld.getSSA();
   1156          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
   1157          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
   1158          newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
   1159          if (d < 0)
   1160             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
   1161 
   1162          delete_Instruction(prog, i);
   1163       }
   1164       break;
   1165 
   1166    case OP_MOD:
   1167       if (s == 1 && imm0.isPow2()) {
   1168          bld.setPosition(i, false);
   1169          if (i->sType == TYPE_U32) {
   1170             i->op = OP_AND;
   1171             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
   1172          } else if (i->sType == TYPE_S32) {
   1173             // Do it on the absolute value of the input, and then restore the
   1174             // sign. The only odd case is MIN_INT, but that should work out
   1175             // as well, since MIN_INT mod any power of 2 is 0.
   1176             //
   1177             // Technically we don't have to do any of this since MOD is
   1178             // undefined with negative arguments in GLSL, but this seems like
   1179             // the nice thing to do.
   1180             Value *abs = bld.mkOp1v(OP_ABS, TYPE_S32, bld.getSSA(), i->getSrc(0));
   1181             Value *neg, *v1, *v2;
   1182             bld.mkCmp(OP_SET, CC_LT, TYPE_S32,
   1183                       (neg = bld.getSSA(1, prog->getTarget()->nativeFile(FILE_PREDICATE))),
   1184                       TYPE_S32, i->getSrc(0), bld.loadImm(NULL, 0));
   1185             Value *mod = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), abs,
   1186                                     bld.loadImm(NULL, imm0.reg.data.u32 - 1));
   1187             bld.mkOp1(OP_NEG, TYPE_S32, (v1 = bld.getSSA()), mod)
   1188                ->setPredicate(CC_P, neg);
   1189             bld.mkOp1(OP_MOV, TYPE_S32, (v2 = bld.getSSA()), mod)
   1190                ->setPredicate(CC_NOT_P, neg);
   1191             newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);
   1192 
   1193             delete_Instruction(prog, i);
   1194          }
   1195       } else if (s == 1) {
   1196          // In this case, we still want the optimized lowering that we get
   1197          // from having division by an immediate.
   1198          //
   1199          // a % b == a - (a/b) * b
   1200          bld.setPosition(i, false);
   1201          Value *div = bld.mkOp2v(OP_DIV, i->sType, bld.getSSA(),
   1202                                  i->getSrc(0), i->getSrc(1));
   1203          newi = bld.mkOp2(OP_ADD, i->sType, i->getDef(0), i->getSrc(0),
   1204                           bld.mkOp2v(OP_MUL, i->sType, bld.getSSA(), div, i->getSrc(1)));
   1205          // TODO: Check that target supports this. In this case, we know that
   1206          // all backends do.
   1207          newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);
   1208 
   1209          delete_Instruction(prog, i);
   1210       }
   1211       break;
   1212 
   1213    case OP_SET: // TODO: SET_AND,OR,XOR
   1214    {
   1215       /* This optimizes the case where the output of a set is being compared
   1216        * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
   1217        * can be a lot cleverer in our comparison.
   1218        */
   1219       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
   1220       CondCode cc, ccZ;
   1221       if (imm0.reg.data.u32 != 0 || !si)
   1222          return;
   1223       cc = si->setCond;
   1224       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
   1225       // We do everything assuming var (cmp) 0, reverse the condition if 0 is
   1226       // first.
   1227       if (s == 0)
   1228          ccZ = reverseCondCode(ccZ);
   1229       // If there is a negative modifier, we need to undo that, by flipping
   1230       // the comparison to zero.
   1231       if (i->src(t).mod.neg())
   1232          ccZ = reverseCondCode(ccZ);
   1233       // If this is a signed comparison, we expect the input to be a regular
   1234       // boolean, i.e. 0/-1. However the rest of the logic assumes that true
   1235       // is positive, so just flip the sign.
   1236       if (i->sType == TYPE_S32) {
   1237          assert(!isFloatType(si->dType));
   1238          ccZ = reverseCondCode(ccZ);
   1239       }
   1240       switch (ccZ) {
   1241       case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
   1242       case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
   1243       case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
   1244       case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
   1245       case CC_GT: break; // bool > 0 -- bool
   1246       case CC_NE: break; // bool != 0 -- bool
   1247       default:
   1248          return;
   1249       }
   1250 
   1251       // Update the condition of this SET to be identical to the origin set,
   1252       // but with the updated condition code. The original SET should get
   1253       // DCE'd, ideally.
   1254       i->op = si->op;
   1255       i->asCmp()->setCond = cc;
   1256       i->setSrc(0, si->src(0));
   1257       i->setSrc(1, si->src(1));
   1258       if (si->srcExists(2))
   1259          i->setSrc(2, si->src(2));
   1260       i->sType = si->sType;
   1261    }
   1262       break;
   1263 
   1264    case OP_AND:
   1265    {
   1266       Instruction *src = i->getSrc(t)->getInsn();
   1267       ImmediateValue imm1;
   1268       if (imm0.reg.data.u32 == 0) {
   1269          i->op = OP_MOV;
   1270          i->setSrc(0, new_ImmediateValue(prog, 0u));
   1271          i->src(0).mod = Modifier(0);
   1272          i->setSrc(1, NULL);
   1273       } else if (imm0.reg.data.u32 == ~0U) {
   1274          i->op = i->src(t).mod.getOp();
   1275          if (t) {
   1276             i->setSrc(0, i->getSrc(t));
   1277             i->src(0).mod = i->src(t).mod;
   1278          }
   1279          i->setSrc(1, NULL);
   1280       } else if (src->asCmp()) {
   1281          CmpInstruction *cmp = src->asCmp();
   1282          if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
   1283             return;
   1284          if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
   1285             return;
   1286          if (imm0.reg.data.f32 != 1.0)
   1287             return;
   1288          if (cmp->dType != TYPE_U32)
   1289             return;
   1290 
   1291          cmp->dType = TYPE_F32;
   1292          if (i->src(t).mod != Modifier(0)) {
   1293             assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
   1294             i->src(t).mod = Modifier(0);
   1295             cmp->setCond = inverseCondCode(cmp->setCond);
   1296          }
   1297          i->op = OP_MOV;
   1298          i->setSrc(s, NULL);
   1299          if (t) {
   1300             i->setSrc(0, i->getSrc(t));
   1301             i->setSrc(t, NULL);
   1302          }
   1303       } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
   1304                  src->op == OP_SHR &&
   1305                  src->src(1).getImmediate(imm1) &&
   1306                  i->src(t).mod == Modifier(0) &&
   1307                  util_is_power_of_two(imm0.reg.data.u32 + 1)) {
   1308          // low byte = offset, high byte = width
   1309          uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
   1310          i->op = OP_EXTBF;
   1311          i->setSrc(0, src->getSrc(0));
   1312          i->setSrc(1, new_ImmediateValue(prog, ext));
   1313       } else if (src->op == OP_SHL &&
   1314                  src->src(1).getImmediate(imm1) &&
   1315                  i->src(t).mod == Modifier(0) &&
   1316                  util_is_power_of_two(~imm0.reg.data.u32 + 1) &&
   1317                  util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
   1318          i->op = OP_MOV;
   1319          i->setSrc(s, NULL);
   1320          if (t) {
   1321             i->setSrc(0, i->getSrc(t));
   1322             i->setSrc(t, NULL);
   1323          }
   1324       }
   1325    }
   1326       break;
   1327 
   1328    case OP_SHL:
   1329    {
   1330       if (s != 1 || i->src(0).mod != Modifier(0))
   1331          break;
   1332       // try to concatenate shifts
   1333       Instruction *si = i->getSrc(0)->getInsn();
   1334       if (!si)
   1335          break;
   1336       ImmediateValue imm1;
   1337       switch (si->op) {
   1338       case OP_SHL:
   1339          if (si->src(1).getImmediate(imm1)) {
   1340             bld.setPosition(i, false);
   1341             i->setSrc(0, si->getSrc(0));
   1342             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
   1343          }
   1344          break;
   1345       case OP_SHR:
   1346          if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
   1347             bld.setPosition(i, false);
   1348             i->op = OP_AND;
   1349             i->setSrc(0, si->getSrc(0));
   1350             i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
   1351          }
   1352          break;
   1353       case OP_MUL:
   1354          int muls;
   1355          if (isFloatType(si->dType))
   1356             return;
   1357          if (si->src(1).getImmediate(imm1))
   1358             muls = 1;
   1359          else if (si->src(0).getImmediate(imm1))
   1360             muls = 0;
   1361          else
   1362             return;
   1363 
   1364          bld.setPosition(i, false);
   1365          i->op = OP_MUL;
   1366          i->setSrc(0, si->getSrc(!muls));
   1367          i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
   1368          break;
   1369       case OP_SUB:
   1370       case OP_ADD:
   1371          int adds;
   1372          if (isFloatType(si->dType))
   1373             return;
   1374          if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
   1375             adds = 0;
   1376          else if (si->src(1).getImmediate(imm1))
   1377             adds = 1;
   1378          else
   1379             return;
   1380          if (si->src(!adds).mod != Modifier(0))
   1381             return;
   1382          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
   1383 
   1384          // This is more operations, but if one of x, y is an immediate, then
   1385          // we can get a situation where (a) we can use ISCADD, or (b)
   1386          // propagate the add bit into an indirect load.
   1387          bld.setPosition(i, false);
   1388          i->op = si->op;
   1389          i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
   1390          i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
   1391                                      bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
   1392                                      si->getSrc(!adds),
   1393                                      bld.mkImm(imm0.reg.data.u32)));
   1394          break;
   1395       default:
   1396          return;
   1397       }
   1398    }
   1399       break;
   1400 
   1401    case OP_ABS:
   1402    case OP_NEG:
   1403    case OP_SAT:
   1404    case OP_LG2:
   1405    case OP_RCP:
   1406    case OP_SQRT:
   1407    case OP_RSQ:
   1408    case OP_PRESIN:
   1409    case OP_SIN:
   1410    case OP_COS:
   1411    case OP_PREEX2:
   1412    case OP_EX2:
   1413       unary(i, imm0);
   1414       break;
   1415    case OP_BFIND: {
   1416       int32_t res;
   1417       switch (i->dType) {
   1418       case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
   1419       case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
   1420       default:
   1421          return;
   1422       }
   1423       if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
   1424          res = 31 - res;
   1425       bld.setPosition(i, false); /* make sure bld is init'ed */
   1426       i->setSrc(0, bld.mkImm(res));
   1427       i->setSrc(1, NULL);
   1428       i->op = OP_MOV;
   1429       i->subOp = 0;
   1430       break;
   1431    }
   1432    case OP_POPCNT: {
   1433       // Only deal with 1-arg POPCNT here
   1434       if (i->srcExists(1))
   1435          break;
   1436       uint32_t res = util_bitcount(imm0.reg.data.u32);
   1437       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
   1438       i->setSrc(1, NULL);
   1439       i->op = OP_MOV;
   1440       break;
   1441    }
   1442    case OP_CVT: {
   1443       Storage res;
   1444 
   1445       // TODO: handle 64-bit values properly
   1446       if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
   1447          return;
   1448 
   1449       // TODO: handle single byte/word extractions
   1450       if (i->subOp)
   1451          return;
   1452 
   1453       bld.setPosition(i, true); /* make sure bld is init'ed */
   1454 
   1455 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
   1456    case type: \
   1457       switch (i->sType) { \
   1458       case TYPE_F64: \
   1459          res.data.dst = util_iround(i->saturate ? \
   1460                                     CLAMP(imm0.reg.data.f64, fmin, fmax) : \
   1461                                     imm0.reg.data.f64); \
   1462          break; \
   1463       case TYPE_F32: \
   1464          res.data.dst = util_iround(i->saturate ? \
   1465                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
   1466                                     imm0.reg.data.f32); \
   1467          break; \
   1468       case TYPE_S32: \
   1469          res.data.dst = i->saturate ? \
   1470                         CLAMP(imm0.reg.data.s32, imin, imax) : \
   1471                         imm0.reg.data.s32; \
   1472          break; \
   1473       case TYPE_U32: \
   1474          res.data.dst = i->saturate ? \
   1475                         CLAMP(imm0.reg.data.u32, umin, umax) : \
   1476                         imm0.reg.data.u32; \
   1477          break; \
   1478       case TYPE_S16: \
   1479          res.data.dst = i->saturate ? \
   1480                         CLAMP(imm0.reg.data.s16, imin, imax) : \
   1481                         imm0.reg.data.s16; \
   1482          break; \
   1483       case TYPE_U16: \
   1484          res.data.dst = i->saturate ? \
   1485                         CLAMP(imm0.reg.data.u16, umin, umax) : \
   1486                         imm0.reg.data.u16; \
   1487          break; \
   1488       default: return; \
   1489       } \
   1490       i->setSrc(0, bld.mkImm(res.data.dst)); \
   1491       break
   1492 
   1493       switch(i->dType) {
   1494       CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
   1495       CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
   1496       CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
   1497       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
   1498       case TYPE_F32:
   1499          switch (i->sType) {
   1500          case TYPE_F64:
   1501             res.data.f32 = i->saturate ?
   1502                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
   1503                imm0.reg.data.f64;
   1504             break;
   1505          case TYPE_F32:
   1506             res.data.f32 = i->saturate ?
   1507                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
   1508                imm0.reg.data.f32;
   1509             break;
   1510          case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
   1511          case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
   1512          case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
   1513          case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
   1514          default:
   1515             return;
   1516          }
   1517          i->setSrc(0, bld.mkImm(res.data.f32));
   1518          break;
   1519       case TYPE_F64:
   1520          switch (i->sType) {
   1521          case TYPE_F64:
   1522             res.data.f64 = i->saturate ?
   1523                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
   1524                imm0.reg.data.f64;
   1525             break;
   1526          case TYPE_F32:
   1527             res.data.f64 = i->saturate ?
   1528                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
   1529                imm0.reg.data.f32;
   1530             break;
   1531          case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
   1532          case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
   1533          case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
   1534          case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
   1535          default:
   1536             return;
   1537          }
   1538          i->setSrc(0, bld.mkImm(res.data.f64));
   1539          break;
   1540       default:
   1541          return;
   1542       }
   1543 #undef CASE
   1544 
   1545       i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
   1546       i->op = OP_MOV;
   1547       i->saturate = 0;
   1548       i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
   1549       break;
   1550    }
   1551    default:
   1552       return;
   1553    }
   1554 
   1555    // This can get left behind some of the optimizations which simplify
   1556    // saturatable values.
   1557    if (newi->op == OP_MOV && newi->saturate) {
   1558       ImmediateValue tmp;
   1559       newi->saturate = 0;
   1560       newi->op = OP_SAT;
   1561       if (newi->src(0).getImmediate(tmp))
   1562          unary(newi, tmp);
   1563    }
   1564 
   1565    if (newi->op != op)
   1566       foldCount++;
   1567 }
   1568 
   1569 // =============================================================================
   1570 
// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   // Folds modifier instructions into their users within one basic block.
   virtual bool visit(BasicBlock *);
};
   1577 
// For every instruction in the block, try to absorb a defining ABS/NEG/NOT
// into the consuming instruction's source modifiers, and fold a trailing SAT
// into the instruction that produces its operand when the target supports it.
bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next; // 'i' may be deleted below (SAT folding)

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         // (deliberately disabled with the constant-false condition)
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         // Skip predicated producers and heavily shared values (folding
         // would not let the producer die anyway).
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         // U32 consumer of an S32 ABS/NEG is tolerated only for ADD/MUL;
         // any other cross-type pair is rejected.
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         // Modifier(op) is non-zero only for ABS/NEG/NOT-style producers.
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod; // compose with the producer's own modifier

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            // Bypass the modifier instruction and carry its effect as a
            // source modifier on 'i'.
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         // Fold SAT into a single-use producer and delete the SAT.
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}
   1648 
   1649 // =============================================================================
   1650 
   1651 // MUL + ADD -> MAD/FMA
   1652 // MIN/MAX(a, a) -> a, etc.
   1653 // SLCT(a, b, const) -> cc(const) ? a : b
   1654 // RCP(RCP(a)) -> a
   1655 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // opcode-specific rewrites; each inspects an instruction's sources and
   // may modify or delete the instruction
   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT_NEG(Instruction *);
   void handleCVT_CVT(Instruction *);
   void handleCVT_EXTBF(Instruction *);
   void handleSUCLAMP(Instruction *);
   void handleNEG(Instruction *);

   // used to emit new values (e.g. immediates) near rewritten instructions
   BuildUtil bld;
};
   1676 
   1677 void
   1678 AlgebraicOpt::handleABS(Instruction *abs)
   1679 {
   1680    Instruction *sub = abs->getSrc(0)->getInsn();
   1681    DataType ty;
   1682    if (!sub ||
   1683        !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
   1684       return;
   1685    // expect not to have mods yet, if we do, bail
   1686    if (sub->src(0).mod || sub->src(1).mod)
   1687       return;
   1688    // hidden conversion ?
   1689    ty = intTypeToSigned(sub->dType);
   1690    if (abs->dType != abs->sType || ty != abs->sType)
   1691       return;
   1692 
   1693    if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
   1694        sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
   1695        sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
   1696          return;
   1697 
   1698    Value *src0 = sub->getSrc(0);
   1699    Value *src1 = sub->getSrc(1);
   1700 
   1701    if (sub->op == OP_ADD) {
   1702       Instruction *neg = sub->getSrc(1)->getInsn();
   1703       if (neg && neg->op != OP_NEG) {
   1704          neg = sub->getSrc(0)->getInsn();
   1705          src0 = sub->getSrc(1);
   1706       }
   1707       if (!neg || neg->op != OP_NEG ||
   1708           neg->dType != neg->sType || neg->sType != ty)
   1709          return;
   1710       src1 = neg->getSrc(0);
   1711    }
   1712 
   1713    // found ABS(SUB))
   1714    abs->moveSources(1, 2); // move sources >=1 up by 2
   1715    abs->op = OP_SAD;
   1716    abs->setType(sub->dType);
   1717    abs->setSrc(0, src0);
   1718    abs->setSrc(1, src1);
   1719    bld.setPosition(abs, false);
   1720    abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
   1721 }
   1722 
   1723 bool
   1724 AlgebraicOpt::handleADD(Instruction *add)
   1725 {
   1726    Value *src0 = add->getSrc(0);
   1727    Value *src1 = add->getSrc(1);
   1728 
   1729    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
   1730       return false;
   1731 
   1732    bool changed = false;
   1733    // we can't optimize to MAD if the add is precise
   1734    if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
   1735       changed = tryADDToMADOrSAD(add, OP_MAD);
   1736    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
   1737       changed = tryADDToMADOrSAD(add, OP_SAD);
   1738    return changed;
   1739 }
   1740 
   1741 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
   1742 // ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   // the producer we can absorb: a MUL for MAD, a zero-accumulator SAD for SAD
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   // modifiers that cannot be carried over: MAD tolerates NEG only, SAD none
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   // the producer's result must have exactly one use so it can be merged away
   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   src = add->getSrc(s);

   // only merge within the same basic block
   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
      return false;

   // these properties cannot be represented on the fused instruction
   if (src->getInsn()->saturate || src->getInsn()->postFactor ||
       src->getInsn()->dnz || src->getInsn()->precise)
      return false;

   if (toOp == OP_SAD) {
      // an existing SAD can only absorb the add if its accumulator is 0
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   // result size and float-ness must agree (signedness may differ, the
   // producer's types are taken below)
   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
      return false;

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high
   add->dnz = src->getInsn()->dnz;
   add->dType = src->getInsn()->dType; // sign matters for imad hi
   add->sType = src->getInsn()->sType;

   // the other addend becomes the accumulator (source 2)
   add->setSrc(2, add->src(s ? 0 : 1));

   // fold a NEG on the producer's result (mod[s]) into its first factor
   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}
   1808 
   1809 void
   1810 AlgebraicOpt::handleMINMAX(Instruction *minmax)
   1811 {
   1812    Value *src0 = minmax->getSrc(0);
   1813    Value *src1 = minmax->getSrc(1);
   1814 
   1815    if (src0 != src1 || src0->reg.file != FILE_GPR)
   1816       return;
   1817    if (minmax->src(0).mod == minmax->src(1).mod) {
   1818       if (minmax->def(0).mayReplace(minmax->src(0))) {
   1819          minmax->def(0).replace(minmax->src(0), false);
   1820          minmax->bb->remove(minmax);
   1821       } else {
   1822          minmax->op = OP_CVT;
   1823          minmax->setSrc(1, NULL);
   1824       }
   1825    } else {
   1826       // TODO:
   1827       // min(x, -x) = -abs(x)
   1828       // min(x, -abs(x)) = -abs(x)
   1829       // min(x, abs(x)) = x
   1830       // max(x, -abs(x)) = x
   1831       // max(x, abs(x)) = abs(x)
   1832       // max(x, -x) = abs(x)
   1833    }
   1834 }
   1835 
   1836 void
   1837 AlgebraicOpt::handleRCP(Instruction *rcp)
   1838 {
   1839    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
   1840 
   1841    if (si && si->op == OP_RCP) {
   1842       Modifier mod = rcp->src(0).mod * si->src(0).mod;
   1843       rcp->op = mod.getOp();
   1844       rcp->setSrc(0, si->getSrc(0));
   1845    }
   1846 }
   1847 
   1848 void
   1849 AlgebraicOpt::handleSLCT(Instruction *slct)
   1850 {
   1851    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
   1852       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
   1853          slct->setSrc(0, slct->getSrc(1));
   1854    } else
   1855    if (slct->getSrc(0) != slct->getSrc(1)) {
   1856       return;
   1857    }
   1858    slct->op = OP_MOV;
   1859    slct->setSrc(1, NULL);
   1860    slct->setSrc(2, NULL);
   1861 }
   1862 
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      // AND(a, a) = OR(a, a) = a; XOR(a, a) is not simplified here
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      // normalize so that set1 is a plain OP_SET; set0 may also be a
      // combined SET_AND/OR/XOR (checked below)
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      // require at least one single-use SET, otherwise this only adds work
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      // emit a set0 copy producing a predicate, followed by a set1 clone
      // that combines with it via redOp and replaces the logop's def
      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}
   1926 
   1927 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
   1928 // nv50:
   1929 //  F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   // only a plain f32 -> s32 conversion without source modifier qualifies
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      // nv50 form: walk back through I2F(ABS(SET))
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   // replace the whole chain by a single boolean (0 / -1) SET
   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}
   1965 
   1966 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
   1967 // does a type conversion, this becomes trickier as there might be range
   1968 // changes/etc. We could handle those in theory as long as the range was being
   1969 // reduced or kept the same.
   1970 void
   1971 AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
   1972 {
   1973    Instruction *insn = cvt->getSrc(0)->getInsn();
   1974    RoundMode rnd = insn->rnd;
   1975 
   1976    if (insn->saturate ||
   1977        insn->subOp ||
   1978        insn->dType != insn->sType ||
   1979        insn->dType != cvt->sType)
   1980       return;
   1981 
   1982    switch (insn->op) {
   1983    case OP_CEIL:
   1984       rnd = ROUND_PI;
   1985       break;
   1986    case OP_FLOOR:
   1987       rnd = ROUND_MI;
   1988       break;
   1989    case OP_TRUNC:
   1990       rnd = ROUND_ZI;
   1991       break;
   1992    case OP_CVT:
   1993       break;
   1994    default:
   1995       return;
   1996    }
   1997 
   1998    if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
   1999       rnd = (RoundMode)(rnd & 3);
   2000 
   2001    cvt->rnd = rnd;
   2002    cvt->setSrc(0, insn->getSrc(0));
   2003    cvt->src(0).mod *= insn->src(0).mod;
   2004    cvt->sType = insn->sType;
   2005 }
   2006 
   2007 // Some shaders extract packed bytes out of words and convert them to
   2008 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
   2009 // nv50 for word sizes.
   2010 //
   2011 // CVT(EXTBF(x, byte/word))
   2012 // CVT(AND(bytemask, x))
   2013 // CVT(AND(bytemask, SHR(x, 8/16/24)))
   2014 // CVT(SHR(x, 16/24))
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;
   unsigned width, offset;
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      // EXTBF packs (width << 8) | offset into src1; CVT can only extract
      // byte/word sized fields at byte/word aligned offsets
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      int s;
      // find the mask operand of the AND
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      // only a low-byte or low-word mask is extractable
      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      // AND of a suitably-shifted value extracts a higher byte/word
      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();
      offset = 0;
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      // a plain shift right extracts the topmost byte or word
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   // select the sub-dword source type; subOp carries the byte offset
   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;
}
   2107 
   2108 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
{
   ImmediateValue imm;
   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
   int s;
   Instruction *add;

   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);

   // look for ADD (TODO: only count references by non-SUCLAMP)
   if (insn->getSrc(0)->refCount() > 1)
      return;
   add = insn->getSrc(0)->getInsn();
   if (!add || add->op != OP_ADD ||
       (add->dType != TYPE_U32 &&
        add->dType != TYPE_S32))
      return;

   // look for immediate
   for (s = 0; s < 2; ++s)
      if (add->src(s).getImmediate(imm))
         break;
   if (s >= 2)
      return;
   // flip s to index the non-immediate addend
   s = s ? 0 : 1;
   // determine if immediate fits (the range accepted here is [-32, 31])
   val += imm.reg.data.s32;
   if (val > 31 || val < -32)
      return;
   // determine if other addend fits
   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
      return;

   bld.setPosition(insn, false); // make sure bld is init'ed
   // replace sources: absorb the addition into the SUCLAMP offset
   insn->setSrc(2, bld.mkImm(val));
   insn->setSrc(0, add->getSrc(s));
}
   2148 
   2149 // NEG(AND(SET, 1)) -> SET
   2150 void
   2151 AlgebraicOpt::handleNEG(Instruction *i) {
   2152    Instruction *src = i->getSrc(0)->getInsn();
   2153    ImmediateValue imm;
   2154    int b;
   2155 
   2156    if (isFloatType(i->sType) || !src || src->op != OP_AND)
   2157       return;
   2158 
   2159    if (src->src(0).getImmediate(imm))
   2160       b = 1;
   2161    else if (src->src(1).getImmediate(imm))
   2162       b = 0;
   2163    else
   2164       return;
   2165 
   2166    if (!imm.isInteger(1))
   2167       return;
   2168 
   2169    Instruction *set = src->getSrc(b)->getInsn();
   2170    if ((set->op == OP_SET || set->op == OP_SET_AND ||
   2171        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
   2172        !isFloatType(set->dType)) {
   2173       i->def(0).replace(set->getDef(0), false);
   2174    }
   2175 }
   2176 
   2177 bool
   2178 AlgebraicOpt::visit(BasicBlock *bb)
   2179 {
   2180    Instruction *next;
   2181    for (Instruction *i = bb->getEntry(); i; i = next) {
   2182       next = i->next;
   2183       switch (i->op) {
   2184       case OP_ABS:
   2185          handleABS(i);
   2186          break;
   2187       case OP_ADD:
   2188          handleADD(i);
   2189          break;
   2190       case OP_RCP:
   2191          handleRCP(i);
   2192          break;
   2193       case OP_MIN:
   2194       case OP_MAX:
   2195          handleMINMAX(i);
   2196          break;
   2197       case OP_SLCT:
   2198          handleSLCT(i);
   2199          break;
   2200       case OP_AND:
   2201       case OP_OR:
   2202       case OP_XOR:
   2203          handleLOGOP(i);
   2204          break;
   2205       case OP_CVT:
   2206          handleCVT_NEG(i);
   2207          handleCVT_CVT(i);
   2208          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
   2209              handleCVT_EXTBF(i);
   2210          break;
   2211       case OP_SUCLAMP:
   2212          handleSUCLAMP(i);
   2213          break;
   2214       case OP_NEG:
   2215          handleNEG(i);
   2216          break;
   2217       default:
   2218          break;
   2219       }
   2220    }
   2221 
   2222    return true;
   2223 }
   2224 
   2225 // =============================================================================
   2226 
   2227 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
class LateAlgebraicOpt : public Pass
{
private:
   virtual bool visit(Instruction *);

   // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
   void handleADD(Instruction *);
   bool tryADDToSHLADD(Instruction *);
};
   2236 
   2237 void
   2238 LateAlgebraicOpt::handleADD(Instruction *add)
   2239 {
   2240    Value *src0 = add->getSrc(0);
   2241    Value *src1 = add->getSrc(1);
   2242 
   2243    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
   2244       return;
   2245 
   2246    if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
   2247       tryADDToSHLADD(add);
   2248 }
   2249 
   2250 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
bool
LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   ImmediateValue imm;
   Instruction *shl;
   Value *src;
   int s;

   // only plain (non-saturating, flag-free) 32-bit integer adds qualify
   if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
       || isFloatType(add->dType))
      return false;

   // one addend must be defined by a SHL
   if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
      s = 0;
   else
   if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
      s = 1;
   else
      return false;

   src = add->getSrc(s);
   shl = src->getUniqueInsn();

   // the shift must be in the same BB and free of flags/subOp/modifiers
   if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
      return false;

   // the shift amount must be an immediate
   if (!shl->src(1).getImmediate(imm))
      return false;

   add->op = OP_SHLADD;
   add->setSrc(2, add->src(!s));
   // SHL can't have any modifiers, but the ADD source may have had
   // one. Preserve it.
   add->setSrc(0, shl->getSrc(0));
   if (s == 1)
      add->src(0).mod = add->src(1).mod;
   add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
   add->src(1).mod = Modifier(0);

   return true;
}
   2294 
   2295 bool
   2296 LateAlgebraicOpt::visit(Instruction *i)
   2297 {
   2298    switch (i->op) {
   2299    case OP_ADD:
   2300       handleADD(i);
   2301       break;
   2302    default:
   2303       break;
   2304    }
   2305 
   2306    return true;
   2307 }
   2308 
   2309 // =============================================================================
   2310 
   2311 // Split 64-bit MUL and MAD
class Split64BitOpPreRA : public Pass
{
private:
   virtual bool visit(BasicBlock *);
   // replace a 64-bit integer MUL/MAD by an equivalent 32-bit sequence;
   // the DataType argument is the 32-bit type of each half
   void split64MulMad(Function *, Instruction *, DataType);

   BuildUtil bld;
};
   2320 
   2321 bool
   2322 Split64BitOpPreRA::visit(BasicBlock *bb)
   2323 {
   2324    Instruction *i, *next;
   2325    Modifier mod;
   2326 
   2327    for (i = bb->getEntry(); i; i = next) {
   2328       next = i->next;
   2329 
   2330       DataType hTy;
   2331       switch (i->dType) {
   2332       case TYPE_U64: hTy = TYPE_U32; break;
   2333       case TYPE_S64: hTy = TYPE_S32; break;
   2334       default:
   2335          continue;
   2336       }
   2337 
   2338       if (i->op == OP_MAD || i->op == OP_MUL)
   2339          split64MulMad(func, i, hTy);
   2340    }
   2341 
   2342    return true;
   2343 }
   2344 
void
Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
{
   assert(i->op == OP_MAD || i->op == OP_MUL);
   assert(!isFloatType(i->dType) && !isFloatType(i->sType));
   assert(typeSizeof(hTy) == 4);

   bld.setPosition(i, true);

   Value *zero = bld.mkImm(0u);
   // carry flag propagated from the low-word MAD into the high-word MAD
   Value *carry = bld.getSSA(1, FILE_FLAGS);

   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
   // values (a, b and c might be 32-bit values), using 32-bit operations. This
   // gives the following operations:
   // * `d.low = low(a.low * b.low) (+ c.low)?`
   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
   //           + high(a.low * b.low) (+ c.high)?`
   //
   // To compute the high bits, we can split in the following operations:
   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
   // * `tmp2   = low(a.low * b.high) + tmp1`
   // * `d.high = high(a.low * b.low) + tmp2`
   //
   // mkSplit put lower bits at index 0 and higher bits at index 1

   // 32-bit sources are widened by pairing them with the zero immediate
   Value *op1[2];
   if (i->getSrc(0)->reg.size == 8)
      bld.mkSplit(op1, 4, i->getSrc(0));
   else {
      op1[0] = i->getSrc(0);
      op1[1] = zero;
   }
   Value *op2[2];
   if (i->getSrc(1)->reg.size == 8)
      bld.mkSplit(op2, 4, i->getSrc(1));
   else {
      op2[0] = i->getSrc(1);
      op2[1] = zero;
   }

   Value *op3[2] = { NULL, NULL };
   if (i->op == OP_MAD) {
      if (i->getSrc(2)->reg.size == 8)
         bld.mkSplit(op3, 4, i->getSrc(2));
      else {
         op3[0] = i->getSrc(2);
         op3[1] = zero;
      }
   }

   // tmp1 = low(a.high * b.low) (+ c.high)?
   Value *tmpRes1Hi = bld.getSSA();
   if (i->op == OP_MAD)
      bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
   else
      bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);

   // tmp2 = low(a.low * b.high) + tmp1
   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);

   Value *def[2] = { bld.getSSA(), bld.getSSA() };

   // If it was a MAD, add the carry from the low bits
   // It is not needed if it was a MUL, since we added high(a.low * b.low) to
   // d.high
   if (i->op == OP_MAD)
      bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
   else
      bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);

   // d.high = high(a.low * b.low) + tmp2 (+ carry from d.low)?
   Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
   hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
   if (i->op == OP_MAD)
      hiPart3->setFlagsSrc(3, carry);

   // recombine the halves and drop the original 64-bit instruction
   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);

   delete_Instruction(fn->getProgram(), i);
}
   2423 
   2424 // =============================================================================
   2425 
   2426 static inline void
   2427 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
   2428 {
   2429    if (offset != ldst->getSrc(0)->reg.data.offset) {
   2430       if (ldst->getSrc(0)->refCount() > 1)
   2431          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
   2432       ldst->getSrc(0)->reg.data.offset = offset;
   2433    }
   2434 }
   2435 
   2436 // Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   // Bookkeeping entry for one load or store already seen in the current
   // basic block.
   class Record
   {
   public:
      Record *next;
      Instruction *insn;   // the tracked load/store instruction
      const Value *rel[2]; // indirect address components (non-NULL if indirect)
      const Value *base;
      int32_t offset;      // constant byte offset of the access
      int8_t fileIndex;
      uint8_t size;        // size of the access in bytes
      bool locked;         // set via lockStores() for overlapping loads
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   // per-data-file lists of the loads and stores seen so far
   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};
   2492 
   2493 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
   2494 {
   2495    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
   2496       loads[i] = NULL;
   2497       stores[i] = NULL;
   2498    }
   2499    prevRecord = NULL;
   2500 }
   2501 
   2502 void
   2503 MemoryOpt::reset()
   2504 {
   2505    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
   2506       Record *it, *next;
   2507       for (it = loads[i]; it; it = next) {
   2508          next = it->next;
   2509          recordPool.release(it);
   2510       }
   2511       loads[i] = NULL;
   2512       for (it = stores[i]; it; it = next) {
   2513          next = it->next;
   2514          recordPool.release(it);
   2515       }
   2516       stores[i] = NULL;
   2517    }
   2518 }
   2519 
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   // the merged access must be a size the target can load in one go
   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // lock any stores that overlap with the load being merged into the
   // existing record.
   lockStores(ld);

   // j = number of defs currently on the recorded load
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      // the new load covers the lower bytes: shift the existing defs up
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      // rebase the record's address to the lower offset (cloning the
      // symbol if it is shared)
      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   // widen the recorded access to cover both loads
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
   2579 
   2580 bool
   2581 MemoryOpt::combineSt(Record *rec, Instruction *st)
   2582 {
   2583    int32_t offRc = rec->offset;
   2584    int32_t offSt = st->getSrc(0)->reg.data.offset;
   2585    int sizeRc = rec->size;
   2586    int sizeSt = typeSizeof(st->dType);
   2587    int s = sizeSt / 4;
   2588    int size = sizeRc + sizeSt;
   2589    int j, k;
   2590    Value *src[4]; // no modifiers in ValueRef allowed for st
   2591    Value *extra[3];
   2592 
   2593    if (!prog->getTarget()->
   2594        isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
   2595       return false;
   2596    // no unaligned stores
   2597    if (size == 8 && MIN2(offRc, offSt) & 0x7)
   2598       return false;
   2599    // for compute indirect stores are not guaranteed to be aligned
   2600    if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
   2601       return false;
   2602 
   2603    // remove any existing load/store records for the store being merged into
   2604    // the existing record.
   2605    purgeRecords(st, DATA_FILE_COUNT);
   2606 
   2607    st->takeExtraSources(0, extra); // save predicate and indirect address
   2608 
   2609    if (offRc < offSt) {
   2610       // save values from @st
   2611       for (s = 0; sizeSt; ++s) {
   2612          sizeSt -= st->getSrc(s + 1)->reg.size;
   2613          src[s] = st->getSrc(s + 1);
   2614       }
   2615       // set record's values as low sources of @st
   2616       for (j = 1; sizeRc; ++j) {
   2617          sizeRc -= rec->insn->getSrc(j)->reg.size;
   2618          st->setSrc(j, rec->insn->getSrc(j));
   2619       }
   2620       // set saved values as high sources of @st
   2621       for (k = j, j = 0; j < s; ++j)
   2622          st->setSrc(k++, src[j]);
   2623 
   2624       updateLdStOffset(st, offRc, func);
   2625    } else {
   2626       for (j = 1; sizeSt; ++j)
   2627          sizeSt -= st->getSrc(j)->reg.size;
   2628       for (s = 1; sizeRc; ++j, ++s) {
   2629          sizeRc -= rec->insn->getSrc(s)->reg.size;
   2630          st->setSrc(j, rec->insn->getSrc(s));
   2631       }
   2632       rec->offset = offSt;
   2633    }
   2634    st->putExtraSources(0, extra); // restore pointer and predicate
   2635 
   2636    delete_Instruction(prog, rec->insn);
   2637    rec->insn = st;
   2638    rec->size = size;
   2639    rec->insn->getSrc(0)->reg.size = size;
   2640    rec->insn->setType(typeOfSize(size));
   2641    return true;
   2642 }
   2643 
   2644 void
   2645 MemoryOpt::Record::set(const Instruction *ldst)
   2646 {
   2647    const Symbol *mem = ldst->getSrc(0)->asSym();
   2648    fileIndex = mem->reg.fileIndex;
   2649    rel[0] = ldst->getIndirect(0, 0);
   2650    rel[1] = ldst->getIndirect(0, 1);
   2651    offset = mem->reg.data.offset;
   2652    base = mem->getBase();
   2653    size = typeSizeof(ldst->sType);
   2654 }
   2655 
   2656 void
   2657 MemoryOpt::Record::link(Record **list)
   2658 {
   2659    next = *list;
   2660    if (next)
   2661       next->prev = this;
   2662    prev = NULL;
   2663    *list = this;
   2664 }
   2665 
   2666 void
   2667 MemoryOpt::Record::unlink(Record **list)
   2668 {
   2669    if (next)
   2670       next->prev = prev;
   2671    if (prev)
   2672       prev->next = next;
   2673    else
   2674       *list = next;
   2675 }
   2676 
   2677 MemoryOpt::Record **
   2678 MemoryOpt::getList(const Instruction *insn)
   2679 {
   2680    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
   2681       return &loads[insn->src(0).getFile()];
   2682    return &stores[insn->src(0).getFile()];
   2683 }
   2684 
   2685 void
   2686 MemoryOpt::addRecord(Instruction *i)
   2687 {
   2688    Record **list = getList(i);
   2689    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
   2690 
   2691    it->link(list);
   2692    it->set(i);
   2693    it->insn = i;
   2694    it->locked = false;
   2695 }
   2696 
// Search the load (@load == true) or store records for one that accesses
// the same 16-byte block of the same symbol as @insn, with the same
// indirect addresses.
// On return @isAdj says whether the found record is merely adjacent to
// @insn's range (merge candidate) rather than containing it (replacement
// candidate). NOTE: @isAdj is only written while candidates are inspected,
// so the caller must only read it when a record is returned.
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      // locked store records may still serve loads, but not other stores
      if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
         continue;
      // must be in the same 16-byte block, same file index and indirects
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it; // record overlaps @insn's range: replace
            if (!(it->offset & 0x7))
               rec = it; // adjacent and 8-byte aligned: merge candidate
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it; // same offset and fits inside the record: replace
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it; // @insn just below the record: merge candidate
      }
   }
   return rec;
}
   2734 
// Forward the values of the store recorded in @rec to the definitions of
// load @ld, eliminating the reload. Fails (returns false) if the offsets
// or sizes don't line up or a stored value is not in a GPR.
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   // find the store source that lines up with the load's offset
   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}
   2758 
// Replace the definitions of load @ldE by the matching definitions of the
// earlier load recorded in @rec, then delete @ldE.
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   // skip the defs of @ldR that lie below @ldE's offset
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   // replace defs pairwise as long as their sizes match
   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}
   2782 
// Merge the earlier store recorded in @rec into later store @st: where the
// ranges overlap @st's values win, values only covered by the record are
// carried over into @st, and the record's old instruction is deleted.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   // the record now spans the union of both ranges
   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offR < offS) {
      // the record starts below @st: rebuild @st's source list from parts
      // of both instructions
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      // @st starts at/below the record: append the record's tail values
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}
   2837 
// Conservative test whether this record's memory range and the one accessed
// by @ldst can refer to the same data.
bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   // This assumes that images/buffers can't overlap. They can.
   // TODO: Plumb the restrict logic through, and only skip when it's a
   // restrict situation, or there can implicitly be no writes.
   if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
      return false;

   // with an indirect address only the base symbols can be compared
   if (this->rel[0] || that.rel[0])
      return this->base == that.base;

   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}
   2857 
   2858 // We must not eliminate stores that affect the result of @ld if
   2859 // we find later stores to the same location, and we may no longer
   2860 // merge them with later stores.
   2861 // The stored value can, however, still be used to determine the value
   2862 // returned by future loads.
   2863 void
   2864 MemoryOpt::lockStores(Instruction *const ld)
   2865 {
   2866    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
   2867       if (!r->locked && r->overlaps(ld))
   2868          r->locked = true;
   2869 }
   2870 
// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
// With @st == NULL, all records of file @f are flushed.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src(0).getFile();

   // NOTE: unlink() leaves r->next intact, so advancing via r->next after
   // unlinking r is safe; unlinked records are recycled in reset().
   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}
   2888 
   2889 bool
   2890 MemoryOpt::visit(BasicBlock *bb)
   2891 {
   2892    bool ret = runOpt(bb);
   2893    // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
   2894    // where 96 bit memory operations are forbidden.
   2895    if (ret)
   2896       ret = runOpt(bb);
   2897    return ret;
   2898 }
   2899 
// Single forward scan over @bb: eliminate, replace or combine memory
// accesses using the records gathered so far, flushing records at
// instructions with barrier-like behavior.
bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;   // whether @ldst survives and gets a record
      bool isLoad = true;
      next = ldst->next;  // saved up front; @ldst may be deleted below

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         // drop a 32-bit store whose GPR source comes from an OP_NOP
         if (typeSizeof(ldst->dType) == 4 &&
             ldst->src(1).getFile() == FILE_GPR &&
             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
            delete_Instruction(prog, ldst);
            continue;
         }
         isLoad = false;
      } else {
         // not a load/store: flush records as dictated by the op's effects
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;
      if (ldst->perPatch) // TODO: create separate per-patch lists
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   // records must not outlive the basic block
   reset();

   return true;
}
   2996 
   2997 // =============================================================================
   2998 
   2999 // Turn control flow into predicated instructions (after register allocation !).
   3000 // TODO:
   3001 // Could move this to before register allocation on NVC0 and also handle nested
   3002 // constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   // turn a simple conditional starting at this block into predication
   bool tryPredicateConditional(BasicBlock *);
   // apply (@cc, @pred) to every non-nop instruction of @bb
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   // replace a jump to a lone BRA/JOIN/EXIT by that instruction
   void tryPropagateBranch(BasicBlock *);
   // whether @pred is derived only from immediates / constant memory
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);

   // shift for converting GPR byte offsets to register units, per target;
   // set in visit(Function), used by isConstantCondition()
   uint8_t gpr_unit;
};
   3018 
// Check whether @pred's value is computed by an OP_SET whose operands come
// only from immediates or constant memory (i.e. the condition does not
// depend on per-invocation data).
bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         // trace through a direct MOV/LOAD to the file it reads from
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
         // in register "units", which can vary between targets.
         if (file == FILE_GPR) {
            Value *v = insn->getSrc(s);
            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
            int units = bytes >> gpr_unit;
            if (units > prog->maxGPR)
               file = FILE_IMMEDIATE; // treat the zero reg like an immediate
         }
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}
   3053 
   3054 void
   3055 FlatteningPass::removeFlow(Instruction *insn)
   3056 {
   3057    FlowInstruction *term = insn ? insn->asFlow() : NULL;
   3058    if (!term)
   3059       return;
   3060    Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
   3061 
   3062    if (term->op == OP_BRA) {
   3063       // TODO: this might get more difficult when we get arbitrary BRAs
   3064       if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
   3065          return;
   3066    } else
   3067    if (term->op != OP_JOIN)
   3068       return;
   3069 
   3070    Value *pred = term->getPredicate();
   3071 
   3072    delete_Instruction(prog, term);
   3073 
   3074    if (pred && pred->refCount() == 0) {
   3075       Instruction *pSet = pred->getUniqueInsn();
   3076       pred->join->reg.data.id = -1; // deallocate
   3077       if (pSet->isDead())
   3078          delete_Instruction(prog, pSet);
   3079    }
   3080 }
   3081 
   3082 void
   3083 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
   3084 {
   3085    for (Instruction *i = bb->getEntry(); i; i = i->next) {
   3086       if (i->isNop())
   3087          continue;
   3088       assert(!i->getPredicate());
   3089       i->setPredicate(cc, pred);
   3090    }
   3091    removeFlow(bb->getExit());
   3092 }
   3093 
   3094 bool
   3095 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
   3096 {
   3097    if (insn->isPseudo())
   3098       return true;
   3099    // TODO: calls where we don't know which registers are modified
   3100 
   3101    if (!prog->getTarget()->mayPredicate(insn, pred))
   3102       return false;
   3103    for (int d = 0; insn->defExists(d); ++d)
   3104       if (insn->getDef(d)->equals(pred))
   3105          return false;
   3106    return true;
   3107 }
   3108 
   3109 // If we jump to BRA/RET/EXIT, replace the jump with it.
   3110 // NOTE: We do not update the CFG anymore here !
   3111 //
   3112 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
   3113 //  BB:0
   3114 //   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
   3115 //  BB1:
   3116 //   bra BB:3
   3117 //  BB2:
   3118 //   ...
   3119 //  BB3:
   3120 //   ...
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      // only consider targets consisting of a single flow instruction
      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      // be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      // add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      // if we were the target's only predecessor, the old flow op is dead
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}
   3150 
   3151 bool
   3152 FlatteningPass::visit(Function *fn)
   3153 {
   3154    gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
   3155 
   3156    return true;
   3157 }
   3158 
bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   if (prog->getTarget()->hasJoin) {
      Instruction *insn = bb->getExit();
      if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
         insn = insn->prev;
         // the candidate must be unpredicated and of a kind that can carry
         // the join flag: no flow/discard/texbar/tex/surface/interp ops, and
         // memory ops only if small (<= 4 bytes) and not indirect
         if (insn && !insn->getPredicate() &&
             !insn->asFlow() &&
             insn->op != OP_DISCARD &&
             insn->op != OP_TEXBAR &&
             !isTextureOp(insn->op) && // probably just nve4
             !isSurfaceOp(insn->op) && // not confirmed
             insn->op != OP_LINTERP && // probably just nve4
             insn->op != OP_PINTERP && // probably just nve4
             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
             !insn->isNop()) {
            insn->join = 1;
            bb->remove(bb->getExit());
            return true;
         }
      }
   }

   tryPropagateBranch(bb);

   return true;
}
   3192 
// Flatten a simple conditional starting at @bb: if the conditionally
// executed block(s) are short enough and every instruction in them may be
// predicated, predicate them with the branch's condition (inverted for the
// "else" side) and remove the branch/join flow instructions.
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   // bits 0/1 of mask select which outgoing edges lead to candidate blocks
   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   // use a lower instruction limit for constant conditions
   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   // "then" side uses the branch's cc, "else" side the inverse
   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}
   3253 
   3254 // =============================================================================
   3255 
   3256 // Fold Immediate into MAD; must be done after register allocation due to
   3257 // constraint SDST == SSRC2
   3258 // TODO:
   3259 // Does NVC0+ have other situations where this pass makes sense?
class PostRaLoadPropagation : public Pass
{
private:
   // dispatches on chipset to one of the handlers below
   virtual bool visit(Instruction *);

   // immediate folding into MAD/FMA, per target generation
   void handleMADforNV50(Instruction *);
   void handleMADforNVC0(Instruction *);
};
   3268 
   3269 static bool
   3270 post_ra_dead(Instruction *i)
   3271 {
   3272    for (int d = 0; i->defExists(d); ++d)
   3273       if (i->getDef(d)->refCount())
   3274          return false;
   3275    return true;
   3276 }
   3277 
// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
void
PostRaLoadPropagation::handleMADforNV50(Instruction *i)
{
   // all operands in GPRs, and dst allocated to the same register as src2
   if (i->def(0).getFile() != FILE_GPR ||
       i->src(0).getFile() != FILE_GPR ||
       i->src(1).getFile() != FILE_GPR ||
       i->src(2).getFile() != FILE_GPR ||
       i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
      return;

   // dst/src0 register ids must be below 64 — presumably an encoding
   // restriction of the immediate MAD form
   if (i->getDef(0)->reg.data.id >= 64 ||
       i->getSrc(0)->reg.data.id >= 64)
      return;

   // a flags source, if present, must be register 0
   if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
      return;

   if (i->getPredicate())
      return;

   Value *vtmp;
   Instruction *def = i->getSrc(1)->getInsn();

   // look through a 32-bit split to the producer of the value
   if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
      def = def->getSrc(0)->getInsn();
   if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
      vtmp = i->getSrc(1);
      if (isFloatType(i->sType)) {
         i->setSrc(1, def->getSrc(0));
      } else {
         ImmediateValue val;
         // getImmediate() has side-effects on the argument so this *shouldn't*
         // be folded into the assert()
         MAYBE_UNUSED bool ret = def->src(0).getImmediate(val);
         assert(ret);
         // select the 16-bit half corresponding to the register's parity
         if (i->getSrc(1)->reg.data.id & 1)
            val.reg.data.u32 >>= 16;
         val.reg.data.u32 &= 0xffff;
         i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
      }

      /* There's no post-RA dead code elimination, so do it here
       * XXX: if we add more code-removing post-RA passes, we might
       *      want to create a post-RA dead-code elim pass */
      if (post_ra_dead(vtmp->getInsn())) {
         Value *src = vtmp->getInsn()->getSrc(0);
         // Careful -- splits will have already been removed from the
         // functions. Don't double-delete.
         if (vtmp->getInsn()->bb)
            delete_Instruction(prog, vtmp->getInsn());
         if (src->getInsn() && post_ra_dead(src->getInsn()))
            delete_Instruction(prog, src->getInsn());
      }
   }
}
   3335 
// NVC0+ variant: fold an immediate operand of a F32 MAD/FMA directly into
// the instruction, subject to the SDST == SSRC2 constraint.
void
PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
{
   // all operands in GPRs, and dst allocated to the same register as src2
   if (i->def(0).getFile() != FILE_GPR ||
       i->src(0).getFile() != FILE_GPR ||
       i->src(1).getFile() != FILE_GPR ||
       i->src(2).getFile() != FILE_GPR ||
       i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
      return;

   // TODO: gm107 can also do this for S32, maybe other chipsets as well
   if (i->dType != TYPE_F32)
      return;

   // src2 may carry at most a NEG modifier
   if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
      return;

   ImmediateValue val;
   int s;

   // find an immediate among src0/src1; s is the *other* (register) source
   if (i->src(0).getImmediate(val))
      s = 1;
   else if (i->src(1).getImmediate(val))
      s = 0;
   else
      return;

   // the remaining register source may also carry at most a NEG modifier
   if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
      return;

   // put the immediate into source slot 1
   if (s == 1)
      i->swapSources(0, 1);

   // replace the loaded register by the immediate itself; the load is now
   // potentially dead, clean it up (no post-RA DCE pass exists)
   Instruction *imm = i->getSrc(1)->getInsn();
   i->setSrc(1, imm->getSrc(0));
   if (post_ra_dead(imm))
      delete_Instruction(prog, imm);
}
   3374 
   3375 bool
   3376 PostRaLoadPropagation::visit(Instruction *i)
   3377 {
   3378    switch (i->op) {
   3379    case OP_FMA:
   3380    case OP_MAD:
   3381       if (prog->getTarget()->getChipset() < 0xc0)
   3382          handleMADforNV50(i);
   3383       else
   3384          handleMADforNVC0(i);
   3385       break;
   3386    default:
   3387       break;
   3388    }
   3389 
   3390    return true;
   3391 }
   3392 
   3393 // =============================================================================
   3394 
// Common subexpression elimination. Stupid O^2 implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // try to replace *ptr with an equivalent earlier instruction
   inline bool tryReplace(Instruction **, Instruction *);

   // candidate instructions, bucketed by opcode
   DLList ops[OP_LAST + 1];
};
   3405 
// Global CSE: when every source of a phi node is the sole use of an
// equivalent instruction in its predecessor block, a single copy of that
// instruction is pulled into this block and the phi is removed
// (see GlobalCSE::visit).
class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
   3411 
   3412 bool
   3413 Instruction::isActionEqual(const Instruction *that) const
   3414 {
   3415    if (this->op != that->op ||
   3416        this->dType != that->dType ||
   3417        this->sType != that->sType)
   3418       return false;
   3419    if (this->cc != that->cc)
   3420       return false;
   3421 
   3422    if (this->asTex()) {
   3423       if (memcmp(&this->asTex()->tex,
   3424                  &that->asTex()->tex,
   3425                  sizeof(this->asTex()->tex)))
   3426          return false;
   3427    } else
   3428    if (this->asCmp()) {
   3429       if (this->asCmp()->setCond != that->asCmp()->setCond)
   3430          return false;
   3431    } else
   3432    if (this->asFlow()) {
   3433       return false;
   3434    } else {
   3435       if (this->ipa != that->ipa ||
   3436           this->lanes != that->lanes ||
   3437           this->perPatch != that->perPatch)
   3438          return false;
   3439       if (this->postFactor != that->postFactor)
   3440          return false;
   3441    }
   3442 
   3443    if (this->subOp != that->subOp ||
   3444        this->saturate != that->saturate ||
   3445        this->rnd != that->rnd ||
   3446        this->ftz != that->ftz ||
   3447        this->dnz != that->dnz ||
   3448        this->cache != that->cache ||
   3449        this->mask != that->mask)
   3450       return false;
   3451 
   3452    return true;
   3453 }
   3454 
   3455 bool
   3456 Instruction::isResultEqual(const Instruction *that) const
   3457 {
   3458    unsigned int d, s;
   3459 
   3460    // NOTE: location of discard only affects tex with liveOnly and quadops
   3461    if (!this->defExists(0) && this->op != OP_DISCARD)
   3462       return false;
   3463 
   3464    if (!isActionEqual(that))
   3465       return false;
   3466 
   3467    if (this->predSrc != that->predSrc)
   3468       return false;
   3469 
   3470    for (d = 0; this->defExists(d); ++d) {
   3471       if (!that->defExists(d) ||
   3472           !this->getDef(d)->equals(that->getDef(d), false))
   3473          return false;
   3474    }
   3475    if (that->defExists(d))
   3476       return false;
   3477 
   3478    for (s = 0; this->srcExists(s); ++s) {
   3479       if (!that->srcExists(s))
   3480          return false;
   3481       if (this->src(s).mod != that->src(s).mod)
   3482          return false;
   3483       if (!this->getSrc(s)->equals(that->getSrc(s), true))
   3484          return false;
   3485    }
   3486    if (that->srcExists(s))
   3487       return false;
   3488 
   3489    if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
   3490       switch (src(0).getFile()) {
   3491       case FILE_MEMORY_CONST:
   3492       case FILE_SHADER_INPUT:
   3493          return true;
   3494       case FILE_SHADER_OUTPUT:
   3495          return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
   3496       default:
   3497          return false;
   3498       }
   3499    }
   3500 
   3501    return true;
   3502 }
   3503 
// pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   // Walk the phi nodes at the top of the block. A phi can be replaced by a
   // single hoisted instruction if every phi source is the sole use of an
   // instruction computing the same result in its predecessor.
   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      if (ik->defCount(0xff) > 1)
         continue; // too painful to check if we can really push this forward
      // Verify the remaining phi sources against the first one's producer.
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      // Only hoist when the loop above accepted all sources (ran to the end).
      if (!phi->srcExists(s)) {
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         // Keep an OP_JOIN, if present, as the first instruction of the block.
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         // The hoisted instruction now defines the value the phi defined.
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}
   3543 
   3544 bool
   3545 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
   3546 {
   3547    Instruction *old = *ptr;
   3548 
   3549    // TODO: maybe relax this later (causes trouble with OP_UNION)
   3550    if (i->isPredicated())
   3551       return false;
   3552 
   3553    if (!old->isResultEqual(i))
   3554       return false;
   3555 
   3556    for (int d = 0; old->defExists(d); ++d)
   3557       old->def(d).replace(i->getDef(d), false);
   3558    delete_Instruction(prog, old);
   3559    *ptr = NULL;
   3560    return true;
   3561 }
   3562 
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   // Repeat until a full sweep over the block eliminates nothing more.
   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getFirst(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         // Fixed instructions must not be removed themselves, but they can
         // still serve as replacements for later equivalent instructions.
         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         // Pick the LValue source with the fewest uses: walking its use
         // list is the cheapest way to find equivalence candidates.
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            // Compare against earlier instructions in this same block that
            // also read `src'.
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            // No LValue sources: fall back to the per-opcode candidate list.
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         // tryReplace NULLs `ir' on success.
         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}
   3623 
   3624 // =============================================================================
   3625 
   3626 // Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   // Repeatedly run the pass until no more dead instructions are found.
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   // Number of instructions removed during the current run.
   unsigned int deadCount;
};
   3639 
   3640 bool
   3641 DeadCodeElim::buryAll(Program *prog)
   3642 {
   3643    do {
   3644       deadCount = 0;
   3645       if (!this->run(prog, false, false))
   3646          return false;
   3647    } while (deadCount);
   3648 
   3649    return true;
   3650 }
   3651 
bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   // Walk the block backwards so that deleting a use can make its producers
   // dead within the same sweep.
   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         // Multi-destination load: some of its results may be unused, so
         // try to shrink or split it (see checkSplitLoad).
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         // Instructions with side effects whose value result is unused:
         // drop the result instead of deleting the whole instruction.
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            i->setDef(0, NULL);
            // An atomic exchange whose old value is ignored is just a store.
            if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
               i->cache = CACHE_CV;
               i->op = OP_STORE;
               i->subOp = 0;
            }
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            // Move def(1) — presumably the lock-status output — into slot 0
            // so the instruction is kept alive (TODO confirm def(1) meaning).
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}
   3686 
   3687 // Each load can go into up to 4 destinations, any of which might potentially
   3688 // be dead (i.e. a hole). These can always be split into 2 loads, independent
   3689 // of where the holes are. We find the first contiguous region, put it into
   3690 // the first load, and then put the second contiguous region into the second
   3691 // load. There can be at most 2 contiguous regions.
   3692 //
   3693 // Note that there are some restrictions, for example it's not possible to do
   3694 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
   3695 // up. Also hardware doesn't support 96-bit loads, so those also have to be
   3696 // split into a 64-bit and 32-bit load.
   3697 void
   3698 DeadCodeElim::checkSplitLoad(Instruction *ld1)
   3699 {
   3700    Instruction *ld2 = NULL; // can get at most 2 loads
   3701    Value *def1[4];
   3702    Value *def2[4];
   3703    int32_t addr1, addr2;
   3704    int32_t size1, size2;
   3705    int d, n1, n2;
   3706    uint32_t mask = 0xffffffff;
   3707 
   3708    for (d = 0; ld1->defExists(d); ++d)
   3709       if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
   3710          mask &= ~(1 << d);
   3711    if (mask == 0xffffffff)
   3712       return;
   3713 
   3714    addr1 = ld1->getSrc(0)->reg.data.offset;
   3715    n1 = n2 = 0;
   3716    size1 = size2 = 0;
   3717 
   3718    // Compute address/width for first load
   3719    for (d = 0; ld1->defExists(d); ++d) {
   3720       if (mask & (1 << d)) {
   3721          if (size1 && (addr1 & 0x7))
   3722             break;
   3723          def1[n1] = ld1->getDef(d);
   3724          size1 += def1[n1++]->reg.size;
   3725       } else
   3726       if (!n1) {
   3727          addr1 += ld1->getDef(d)->reg.size;
   3728       } else {
   3729          break;
   3730       }
   3731    }
   3732 
   3733    // Scale back the size of the first load until it can be loaded. This
   3734    // typically happens for TYPE_B96 loads.
   3735    while (n1 &&
   3736           !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
   3737                                                 typeOfSize(size1))) {
   3738       size1 -= def1[--n1]->reg.size;
   3739       d--;
   3740    }
   3741 
   3742    // Compute address/width for second load
   3743    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
   3744       if (mask & (1 << d)) {
   3745          assert(!size2 || !(addr2 & 0x7));
   3746          def2[n2] = ld1->getDef(d);
   3747          size2 += def2[n2++]->reg.size;
   3748       } else if (!n2) {
   3749          assert(!n2);
   3750          addr2 += ld1->getDef(d)->reg.size;
   3751       } else {
   3752          break;
   3753       }
   3754    }
   3755 
   3756    // Make sure that we've processed all the values
   3757    for (; ld1->defExists(d); ++d)
   3758       assert(!(mask & (1 << d)));
   3759 
   3760    updateLdStOffset(ld1, addr1, func);
   3761    ld1->setType(typeOfSize(size1));
   3762    for (d = 0; d < 4; ++d)
   3763       ld1->setDef(d, (d < n1) ? def1[d] : NULL);
   3764 
   3765    if (!n2)
   3766       return;
   3767 
   3768    ld2 = cloneShallow(func, ld1);
   3769    updateLdStOffset(ld2, addr2, func);
   3770    ld2->setType(typeOfSize(size2));
   3771    for (d = 0; d < 4; ++d)
   3772       ld2->setDef(d, (d < n2) ? def2[d] : NULL);
   3773 
   3774    ld1->bb->insertAfter(ld1, ld2);
   3775 }
   3776 
   3777 // =============================================================================
   3778 
// Instantiate pass class `n' and invoke its method `f' on this Program,
// but only when the requested optimization level is at least `l'.
// A failing pass aborts the enclosing function with `false'.
#define RUN_PASS(l, n, f)                       \
   if (level >= (l)) {                          \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
         INFO("PEEPHOLE: %s\n", #n);            \
      n pass;                                   \
      if (!pass.f(this))                        \
         return false;                          \
   }
   3787 
// Optimization pipeline run while the program is still in SSA form.
// `level' gates the more expensive passes via RUN_PASS; returns false if
// any pass fails.
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(1, Split64BitOpPreRA, run);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LateAlgebraicOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}
   3809 
// Optimization passes run after register allocation; only transformations
// that respect the fixed register assignment are performed here.
bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   RUN_PASS(2, PostRaLoadPropagation, run);

   return true;
}
   3818 
   3819 }
   3820