Home | History | Annotate | Download | only in codegen
      1 /*
      2  * Copyright 2011 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "codegen/nv50_ir.h"
     24 #include "codegen/nv50_ir_target.h"
     25 #include "codegen/nv50_ir_build_util.h"
     26 
     27 extern "C" {
     28 #include "util/u_math.h"
     29 }
     30 
     31 namespace nv50_ir {
     32 
// Returns true if this instruction has no effect on the program and can be
// skipped by code emission: IR-only pseudo ops, plain NOPs, instructions
// whose results are all unused (no register assigned), and identity
// MOV/UNION operations.
bool
Instruction::isNop() const
{
   // Pure IR bookkeeping ops, never emitted.
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   // Atomics have memory side effects even if the result is unused.
   if (op == OP_ATOM)
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   // reg.data.id < 0 means no hardware register was assigned, i.e. the
   // (representative) def is unused.
   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   // A MOV (or 2-source UNION) whose destination already equals its
   // source(s) is an identity operation.
   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}
     63 
     64 bool Instruction::isDead() const
     65 {
     66    if (op == OP_STORE ||
     67        op == OP_EXPORT ||
     68        op == OP_ATOM ||
     69        op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
     70        op == OP_WRSV)
     71       return false;
     72 
     73    for (int d = 0; defExists(d); ++d)
     74       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
     75          return false;
     76 
     77    if (terminator || asFlow())
     78       return false;
     79    if (fixed)
     80       return false;
     81 
     82    return true;
     83 };
     84 
     85 // =============================================================================
     86 
// Pass that forwards register-to-register MOVs into their uses (see the
// comment on visit() below for the restrictions).
class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
     92 
     93 // Propagate all MOVs forward to make subsequent optimization easier, except if
     94 // the sources stem from a phi, in which case we don't want to mess up potential
     95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
     96 bool
     97 CopyPropagation::visit(BasicBlock *bb)
     98 {
     99    Instruction *mov, *si, *next;
    100 
    101    for (mov = bb->getEntry(); mov; mov = next) {
    102       next = mov->next;
    103       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
    104          continue;
    105       if (mov->getPredicate())
    106          continue;
    107       if (mov->def(0).getFile() != mov->src(0).getFile())
    108          continue;
    109       si = mov->getSrc(0)->getInsn();
    110       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
    111          // propagate
    112          mov->def(0).replace(mov->getSrc(0), false);
    113          delete_Instruction(prog, mov);
    114       }
    115    }
    116    return true;
    117 }
    118 
    119 // =============================================================================
    120 
// Pass that eliminates redundant SPLIT/MERGE pairs (see the comment on
// visit() below).
class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
    126 
    127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
    128 // post-merge def with the SPLIT's source.
    129 bool
    130 MergeSplits::visit(BasicBlock *bb)
    131 {
    132    Instruction *i, *next, *si;
    133 
    134    for (i = bb->getEntry(); i; i = next) {
    135       next = i->next;
    136       if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
    137          continue;
    138       si = i->getSrc(0)->getInsn();
    139       if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
    140          continue;
    141       i->def(0).replace(si->getSrc(0), false);
    142       delete_Instruction(prog, i);
    143    }
    144 
    145    return true;
    146 }
    147 
    148 // =============================================================================
    149 
// Pass that folds loads/immediates directly into the instructions that use
// them, swapping commutative sources where that enables the fold.
class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // Swap src(0)/src(1) if that lets a load/immediate be encoded inline.
   void checkSwapSrc01(Instruction *);

   // Classification helpers for the defining instruction of a source.
   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};
    161 
    162 bool
    163 LoadPropagation::isCSpaceLoad(Instruction *ld)
    164 {
    165    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
    166 }
    167 
    168 bool
    169 LoadPropagation::isImmdLoad(Instruction *ld)
    170 {
    171    if (!ld || (ld->op != OP_MOV) ||
    172        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
    173       return false;
    174 
    175    // A 0 can be replaced with a register, so it doesn't count as an immediate.
    176    ImmediateValue val;
    177    return ld->src(0).getImmediate(val) && !val.isInteger(0);
    178 }
    179 
    180 bool
    181 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
    182 {
    183    return ld &&
    184       (ld->op == OP_VFETCH ||
    185        (ld->op == OP_LOAD &&
    186         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
    187          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
    188 }
    189 
// Swap src(0) and src(1) of a commutative (or reversible: SET/SLCT/SUB)
// instruction when that moves a constant-buffer load, immediate, or
// attrib/shared load into position 1, where the target can encode it
// inline. Adjusts the condition code / modifiers to keep semantics.
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   const Target *targ = prog->getTarget();
   // Only commutative ops, or ops we know how to reverse, may be swapped.
   if (!targ->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
         return;
   // Swapping only helps when src(1) currently occupies a GPR.
   if (insn->src(1).getFile() != FILE_GPR)
      return;
   // This is the special OP_SET used for alphatesting, we can't reverse its
   // arguments as that will confuse the fixup code.
   if (insn->op == OP_SET && insn->subOp)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Swap sources to inline the less frequently used source. That way,
   // optimistically, it will eventually be able to remove the instruction.
   int i0refs = insn->getSrc(0)->refCount();
   int i1refs = insn->getSrc(1)->refCount();

   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
      // Swap unless src(1) is an equally good candidate with fewer refs.
      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
          !targ->insnCanLoad(insn, 1, i1) ||
          i0refs < i1refs)
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      // Prefer the attrib/shared load in position 0.
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   // Sources were swapped: fix up the comparison / subtraction semantics.
   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SUB) {
      // a - b == -b + a: negate both sources' modifiers.
      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
   }
}
    241 
// For each instruction in @bb, try to replace GPR sources by the memory
// operand / immediate their defining LOAD or MOV reads, when the target
// can encode that operand directly.
bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
         continue;

      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
         continue;

      // Try to move a foldable operand into src(1) first.
      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         // The target decides whether this operand can be encoded inline.
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         // Delete the load if this was its last use.
         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}
    279 
    280 // =============================================================================
    281 
// Pass that folds constant offsets (ADD/SUB/MOV of immediates) on address
// registers into the memory operand's immediate offset.
class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
    287 
// For each indirect source, inspect the instruction computing the address:
// - ADD addr, imm  -> use addr directly and add imm to the static offset
// - SUB addr, imm  -> use addr directly and subtract imm from the offset
// - MOV imm        -> drop the indirection entirely, fold imm into offset
// The target is consulted so the resulting offset stays encodable.
bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            // Need $a + immediate, and the combined offset must be loadable.
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            // Clone the symbol before editing its offset; it may be shared.
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            // Address is fully constant: remove the indirection.
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}
    333 
    334 // =============================================================================
    335 
    336 // Evaluate constant expressions.
    337 class ConstantFolding : public Pass
    338 {
    339 public:
    340    bool foldAll(Program *);
    341 
    342 private:
    343    virtual bool visit(BasicBlock *);
    344 
    345    void expr(Instruction *, ImmediateValue&, ImmediateValue&);
    346    void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
    347    void opnd(Instruction *, ImmediateValue&, int s);
    348    void opnd3(Instruction *, ImmediateValue&);
    349 
    350    void unary(Instruction *, const ImmediateValue&);
    351 
    352    void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
    353 
    354    CmpInstruction *findOriginForTestWithZero(Value *);
    355 
    356    unsigned int foldCount;
    357 
    358    BuildUtil bld;
    359 };
    360 
    361 // TODO: remember generated immediates and only revisit these
    362 bool
    363 ConstantFolding::foldAll(Program *prog)
    364 {
    365    unsigned int iterCount = 0;
    366    do {
    367       foldCount = 0;
    368       if (!run(prog))
    369          return false;
    370    } while (foldCount && ++iterCount < 2);
    371    return true;
    372 }
    373 
// Dispatch each instruction to the appropriate folding routine depending
// on how many of its sources are immediates: 3 -> expr/3, 2 -> expr/2,
// otherwise single-operand simplification via opnd()/opnd3().
bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      // MOVs of immediates are the *result* of folding; CALL args must
      // stay in registers.
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2))
         expr(i, src0, src1, src2);
      else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
      // src(2) immediate handling is independent of the cases above.
      if (i->srcExists(2) && i->src(2).getImmediate(src2))
         opnd3(i, src2);
   }
   return true;
}
    406 
    407 CmpInstruction *
    408 ConstantFolding::findOriginForTestWithZero(Value *value)
    409 {
    410    if (!value)
    411       return NULL;
    412    Instruction *insn = value->getInsn();
    413 
    414    if (insn->asCmp() && insn->op != OP_SLCT)
    415       return insn->asCmp();
    416 
    417    /* Sometimes mov's will sneak in as a result of other folding. This gets
    418     * cleaned up later.
    419     */
    420    if (insn->op == OP_MOV)
    421       return findOriginForTestWithZero(insn->getSrc(0));
    422 
    423    /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
    424    if (insn->op == OP_AND) {
    425       int s = 0;
    426       ImmediateValue imm;
    427       if (!insn->src(s).getImmediate(imm)) {
    428          s = 1;
    429          if (!insn->src(s).getImmediate(imm))
    430             return NULL;
    431       }
    432       if (imm.reg.data.f32 != 1.0f)
    433          return NULL;
    434       /* TODO: Come up with a way to handle the condition being inverted */
    435       if (insn->src(!s).mod != Modifier(0))
    436          return NULL;
    437       return findOriginForTestWithZero(insn->getSrc(!s));
    438    }
    439 
    440    return NULL;
    441 }
    442 
// Apply this modifier (ABS/NEG/SAT/NOT) to an immediate value in place,
// interpreting it according to imm.reg.type.
void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         // Clamp to [0, 1].
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT)); // NOT is integer-only
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      // NOTE(review): negation/abs of INT32_MIN overflows; presumably such
      // immediates do not occur here.
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         // Clamp to [0, 1].
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT)); // NOT is integer-only
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}
    500 
    501 operation
    502 Modifier::getOp() const
    503 {
    504    switch (bits) {
    505    case NV50_IR_MOD_ABS: return OP_ABS;
    506    case NV50_IR_MOD_NEG: return OP_NEG;
    507    case NV50_IR_MOD_SAT: return OP_SAT;
    508    case NV50_IR_MOD_NOT: return OP_NOT;
    509    case 0:
    510       return OP_MOV;
    511    default:
    512       return OP_CVT;
    513    }
    514 }
    515 
// Fold an instruction whose first two sources are the immediates imm0 and
// imm1: compute the result on the host, then rewrite the instruction as a
// MOV (or SAT) of the resulting immediate. MAD/FMA keep their third source
// and become an ADD; PFETCH keeps its (now single) folded argument.
void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      // dnz: denormals/NaN/Inf are flushed to zero before multiplying.
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0) // don't fold division by zero
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SUB:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      // Arithmetic vs. logical shift depends on the signedness of dType.
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      // Only foldable if both alternatives are the same value.
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   case OP_EXTBF: {
      // b encodes the bitfield: offset in bits 0-7, width in bits 8-15.
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      // Shift the field up to the MSB first so the right shift below
      // sign-extends for S32.
      if (width + offset < 32) {
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      type = TYPE_U32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         // src(0) is the low half, src(1) the high half.
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   // Modifiers were already baked into imm0/imm1 by the caller.
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();

      // Move the immediate into position 1, where we know it might be
      // emittable. However it might not be anyways, as there may be other
      // restrictions, so move it into a separate LValue.
      bld.setPosition(i, false);
      i->op = OP_ADD;
      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      // Retry folding on the newly formed ADD.
      if (i->src(0).getImmediate(src0))
         expr(i, src0, src1);
      else
         opnd(i, src1, 1);
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
      break;
   }
   i->subOp = 0;
}
    733 
    734 void
    735 ConstantFolding::expr(Instruction *i,
    736                       ImmediateValue &imm0,
    737                       ImmediateValue &imm1,
    738                       ImmediateValue &imm2)
    739 {
    740    struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
    741    struct Storage res;
    742 
    743    memset(&res.data, 0, sizeof(res.data));
    744 
    745    switch (i->op) {
    746    case OP_INSBF: {
    747       int offset = b->data.u32 & 0xff;
    748       int width = (b->data.u32 >> 8) & 0xff;
    749       unsigned bitmask = ((1 << width) - 1) << offset;
    750       res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
    751       break;
    752    }
    753    case OP_MAD:
    754    case OP_FMA: {
    755       switch (i->dType) {
    756       case TYPE_F32:
    757          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
    758             c->data.f32;
    759          break;
    760       case TYPE_F64:
    761          res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
    762          break;
    763       case TYPE_S32:
    764          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
    765             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
    766             break;
    767          }
    768          /* fallthrough */
    769       case TYPE_U32:
    770          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
    771             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
    772             break;
    773          }
    774          res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
    775          break;
    776       default:
    777          return;
    778       }
    779       break;
    780    }
    781    case OP_SHLADD:
    782       res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
    783       break;
    784    default:
    785       return;
    786    }
    787 
    788    ++foldCount;
    789    i->src(0).mod = Modifier(0);
    790    i->src(1).mod = Modifier(0);
    791    i->src(2).mod = Modifier(0);
    792 
    793    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
    794    i->setSrc(1, NULL);
    795    i->setSrc(2, NULL);
    796 
    797    i->getSrc(0)->reg.data = res.data;
    798    i->getSrc(0)->reg.type = i->dType;
    799    i->getSrc(0)->reg.size = typeSizeof(i->dType);
    800 
    801    i->op = OP_MOV;
    802 }
    803 
    804 void
    805 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
    806 {
    807    Storage res;
    808 
    809    if (i->dType != TYPE_F32)
    810       return;
    811    switch (i->op) {
    812    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
    813    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
    814    case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
    815    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
    816    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
    817    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
    818    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
    819    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
    820    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
    821    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
    822    case OP_PRESIN:
    823    case OP_PREEX2:
    824       // these should be handled in subsequent OP_SIN/COS/EX2
    825       res.data.f32 = imm.reg.data.f32;
    826       break;
    827    default:
    828       return;
    829    }
    830    i->op = OP_MOV;
    831    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
    832    i->src(0).mod = Modifier(0);
    833 }
    834 
// Given mul2 = MUL(x, imm2) (imm2 at source index @s), try to collapse a
// chain of two F32 multiplies into one, either by combining two immediates
// or by encoding the constant factor as a post-factor exponent.
void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1; // index of the non-immediate source
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   // Fold any existing post-factor of mul2 into the constant.
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   // Case 1: mul2's variable operand is itself the single-use result of an
   // earlier F32 MUL.
   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm   -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            // A negative constant can't go into the exponent; negate a
            // source modifier instead.
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   // Case 2: mul2's single-use result feeds a later F32 MUL; push the
   // constant factor into that instruction's post-factor.
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c   -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      // NOTE: mul1/mul2 are re-purposed here: mul1 is the earlier MUL with
      // the immediate, mul2 (if set) the later consumer.
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}
    899 
    900 void
    901 ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
    902 {
    903    switch (i->op) {
    904    case OP_MAD:
    905    case OP_FMA:
    906       if (imm2.isInteger(0)) {
    907          i->op = OP_MUL;
    908          i->setSrc(2, NULL);
    909          foldCount++;
    910          return;
    911       }
    912       break;
    913    case OP_SHLADD:
    914       if (imm2.isInteger(0)) {
    915          i->op = OP_SHL;
    916          i->setSrc(2, NULL);
    917          foldCount++;
    918          return;
    919       }
    920       break;
    921    default:
    922       return;
    923    }
    924 }
    925 
    926 void
    927 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
    928 {
    929    const Target *target = prog->getTarget();
    930    const int t = !s;
    931    const operation op = i->op;
    932    Instruction *newi = i;
    933 
    934    switch (i->op) {
    935    case OP_SPLIT: {
    936       bld.setPosition(i, false);
    937 
    938       uint8_t size = i->getDef(0)->reg.size;
    939       uint32_t mask = (1ULL << size) - 1;
    940       assert(size <= 32);
    941 
    942       uint64_t val = imm0.reg.data.u64;
    943       for (int8_t d = 0; i->defExists(d); ++d) {
    944          Value *def = i->getDef(d);
    945          assert(def->reg.size == size);
    946 
    947          newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
    948          val >>= size;
    949       }
    950       delete_Instruction(prog, i);
    951       break;
    952    }
    953    case OP_MUL:
    954       if (i->dType == TYPE_F32)
    955          tryCollapseChainedMULs(i, s, imm0);
    956 
    957       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
    958          assert(!isFloatType(i->sType));
    959          if (imm0.isInteger(1) && i->dType == TYPE_S32) {
    960             bld.setPosition(i, false);
    961             // Need to set to the sign value, which is a compare.
    962             newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
    963                              TYPE_S32, i->getSrc(t), bld.mkImm(0));
    964             delete_Instruction(prog, i);
    965          } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
    966             // The high bits can't be set in this case (either mul by 0 or
    967             // unsigned by 1)
    968             i->op = OP_MOV;
    969             i->subOp = 0;
    970             i->setSrc(0, new_ImmediateValue(prog, 0u));
    971             i->src(0).mod = Modifier(0);
    972             i->setSrc(1, NULL);
    973          } else if (!imm0.isNegative() && imm0.isPow2()) {
    974             // Translate into a shift
    975             imm0.applyLog2();
    976             i->op = OP_SHR;
    977             i->subOp = 0;
    978             imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
    979             i->setSrc(0, i->getSrc(t));
    980             i->src(0).mod = i->src(t).mod;
    981             i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
    982             i->src(1).mod = 0;
    983          }
    984       } else
    985       if (imm0.isInteger(0)) {
    986          i->op = OP_MOV;
    987          i->setSrc(0, new_ImmediateValue(prog, 0u));
    988          i->src(0).mod = Modifier(0);
    989          i->postFactor = 0;
    990          i->setSrc(1, NULL);
    991       } else
    992       if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
    993          if (imm0.isNegative())
    994             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
    995          i->op = i->src(t).mod.getOp();
    996          if (s == 0) {
    997             i->setSrc(0, i->getSrc(1));
    998             i->src(0).mod = i->src(1).mod;
    999             i->src(1).mod = 0;
   1000          }
   1001          if (i->op != OP_CVT)
   1002             i->src(0).mod = 0;
   1003          i->setSrc(1, NULL);
   1004       } else
   1005       if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
   1006          if (imm0.isNegative())
   1007             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
   1008          i->op = OP_ADD;
   1009          i->setSrc(s, i->getSrc(t));
   1010          i->src(s).mod = i->src(t).mod;
   1011       } else
   1012       if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
   1013          i->op = OP_SHL;
   1014          imm0.applyLog2();
   1015          i->setSrc(0, i->getSrc(t));
   1016          i->src(0).mod = i->src(t).mod;
   1017          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
   1018          i->src(1).mod = 0;
   1019       } else
   1020       if (i->postFactor && i->sType == TYPE_F32) {
   1021          /* Can't emit a postfactor with an immediate, have to fold it in */
   1022          i->setSrc(s, new_ImmediateValue(
   1023                       prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
   1024          i->postFactor = 0;
   1025       }
   1026       break;
   1027    case OP_MAD:
   1028       if (imm0.isInteger(0)) {
   1029          i->setSrc(0, i->getSrc(2));
   1030          i->src(0).mod = i->src(2).mod;
   1031          i->setSrc(1, NULL);
   1032          i->setSrc(2, NULL);
   1033          i->op = i->src(0).mod.getOp();
   1034          if (i->op != OP_CVT)
   1035             i->src(0).mod = 0;
   1036       } else
   1037       if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
   1038           (imm0.isInteger(1) || imm0.isInteger(-1))) {
   1039          if (imm0.isNegative())
   1040             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
   1041          if (s == 0) {
   1042             i->setSrc(0, i->getSrc(1));
   1043             i->src(0).mod = i->src(1).mod;
   1044          }
   1045          i->setSrc(1, i->getSrc(2));
   1046          i->src(1).mod = i->src(2).mod;
   1047          i->setSrc(2, NULL);
   1048          i->op = OP_ADD;
   1049       } else
   1050       if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
   1051           target->isOpSupported(OP_SHLADD, i->dType)) {
   1052          i->op = OP_SHLADD;
   1053          imm0.applyLog2();
   1054          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
   1055       }
   1056       break;
   1057    case OP_ADD:
   1058    case OP_SUB:
   1059       if (i->usesFlags())
   1060          break;
   1061       if (imm0.isInteger(0)) {
   1062          if (s == 0) {
   1063             i->setSrc(0, i->getSrc(1));
   1064             i->src(0).mod = i->src(1).mod;
   1065             if (i->op == OP_SUB)
   1066                i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
   1067          }
   1068          i->setSrc(1, NULL);
   1069          i->op = i->src(0).mod.getOp();
   1070          if (i->op != OP_CVT)
   1071             i->src(0).mod = Modifier(0);
   1072       }
   1073       break;
   1074 
   1075    case OP_DIV:
   1076       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
   1077          break;
   1078       bld.setPosition(i, false);
   1079       if (imm0.reg.data.u32 == 0) {
   1080          break;
   1081       } else
   1082       if (imm0.reg.data.u32 == 1) {
   1083          i->op = OP_MOV;
   1084          i->setSrc(1, NULL);
   1085       } else
   1086       if (i->dType == TYPE_U32 && imm0.isPow2()) {
   1087          i->op = OP_SHR;
   1088          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
   1089       } else
   1090       if (i->dType == TYPE_U32) {
   1091          Instruction *mul;
   1092          Value *tA, *tB;
   1093          const uint32_t d = imm0.reg.data.u32;
   1094          uint32_t m;
   1095          int r, s;
   1096          uint32_t l = util_logbase2(d);
   1097          if (((uint32_t)1 << l) < d)
   1098             ++l;
   1099          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
   1100          r = l ? 1 : 0;
   1101          s = l ? (l - 1) : 0;
   1102 
   1103          tA = bld.getSSA();
   1104          tB = bld.getSSA();
   1105          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
   1106                          bld.loadImm(NULL, m));
   1107          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
   1108          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
   1109          tA = bld.getSSA();
   1110          if (r)
   1111             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
   1112          else
   1113             tA = tB;
   1114          tB = s ? bld.getSSA() : i->getDef(0);
   1115          newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
   1116          if (s)
   1117             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
   1118 
   1119          delete_Instruction(prog, i);
   1120       } else
   1121       if (imm0.reg.data.s32 == -1) {
   1122          i->op = OP_NEG;
   1123          i->setSrc(1, NULL);
   1124       } else {
   1125          LValue *tA, *tB;
   1126          LValue *tD;
   1127          const int32_t d = imm0.reg.data.s32;
   1128          int32_t m;
   1129          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
   1130          if ((1 << l) < abs(d))
   1131             ++l;
   1132          if (!l)
   1133             l = 1;
   1134          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
   1135 
   1136          tA = bld.getSSA();
   1137          tB = bld.getSSA();
   1138          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
   1139                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
   1140          if (l > 1)
   1141             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
   1142          else
   1143             tB = tA;
   1144          tA = bld.getSSA();
   1145          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
   1146          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
   1147          newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
   1148          if (d < 0)
   1149             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
   1150 
   1151          delete_Instruction(prog, i);
   1152       }
   1153       break;
   1154 
   1155    case OP_MOD:
   1156       if (i->sType == TYPE_U32 && imm0.isPow2()) {
   1157          bld.setPosition(i, false);
   1158          i->op = OP_AND;
   1159          i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
   1160       }
   1161       break;
   1162 
   1163    case OP_SET: // TODO: SET_AND,OR,XOR
   1164    {
   1165       /* This optimizes the case where the output of a set is being compared
   1166        * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
   1167        * can be a lot cleverer in our comparison.
   1168        */
   1169       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
   1170       CondCode cc, ccZ;
   1171       if (imm0.reg.data.u32 != 0 || !si)
   1172          return;
   1173       cc = si->setCond;
   1174       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
   1175       // We do everything assuming var (cmp) 0, reverse the condition if 0 is
   1176       // first.
   1177       if (s == 0)
   1178          ccZ = reverseCondCode(ccZ);
   1179       // If there is a negative modifier, we need to undo that, by flipping
   1180       // the comparison to zero.
   1181       if (i->src(t).mod.neg())
   1182          ccZ = reverseCondCode(ccZ);
   1183       // If this is a signed comparison, we expect the input to be a regular
   1184       // boolean, i.e. 0/-1. However the rest of the logic assumes that true
   1185       // is positive, so just flip the sign.
   1186       if (i->sType == TYPE_S32) {
   1187          assert(!isFloatType(si->dType));
   1188          ccZ = reverseCondCode(ccZ);
   1189       }
   1190       switch (ccZ) {
   1191       case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
   1192       case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
   1193       case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
   1194       case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
   1195       case CC_GT: break; // bool > 0 -- bool
   1196       case CC_NE: break; // bool != 0 -- bool
   1197       default:
   1198          return;
   1199       }
   1200 
   1201       // Update the condition of this SET to be identical to the origin set,
   1202       // but with the updated condition code. The original SET should get
   1203       // DCE'd, ideally.
   1204       i->op = si->op;
   1205       i->asCmp()->setCond = cc;
   1206       i->setSrc(0, si->src(0));
   1207       i->setSrc(1, si->src(1));
   1208       if (si->srcExists(2))
   1209          i->setSrc(2, si->src(2));
   1210       i->sType = si->sType;
   1211    }
   1212       break;
   1213 
   1214    case OP_AND:
   1215    {
   1216       Instruction *src = i->getSrc(t)->getInsn();
   1217       ImmediateValue imm1;
   1218       if (imm0.reg.data.u32 == 0) {
   1219          i->op = OP_MOV;
   1220          i->setSrc(0, new_ImmediateValue(prog, 0u));
   1221          i->src(0).mod = Modifier(0);
   1222          i->setSrc(1, NULL);
   1223       } else if (imm0.reg.data.u32 == ~0U) {
   1224          i->op = i->src(t).mod.getOp();
   1225          if (t) {
   1226             i->setSrc(0, i->getSrc(t));
   1227             i->src(0).mod = i->src(t).mod;
   1228          }
   1229          i->setSrc(1, NULL);
   1230       } else if (src->asCmp()) {
   1231          CmpInstruction *cmp = src->asCmp();
   1232          if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
   1233             return;
   1234          if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
   1235             return;
   1236          if (imm0.reg.data.f32 != 1.0)
   1237             return;
   1238          if (cmp->dType != TYPE_U32)
   1239             return;
   1240 
   1241          cmp->dType = TYPE_F32;
   1242          if (i->src(t).mod != Modifier(0)) {
   1243             assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
   1244             i->src(t).mod = Modifier(0);
   1245             cmp->setCond = inverseCondCode(cmp->setCond);
   1246          }
   1247          i->op = OP_MOV;
   1248          i->setSrc(s, NULL);
   1249          if (t) {
   1250             i->setSrc(0, i->getSrc(t));
   1251             i->setSrc(t, NULL);
   1252          }
   1253       } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
   1254                  src->op == OP_SHR &&
   1255                  src->src(1).getImmediate(imm1) &&
   1256                  i->src(t).mod == Modifier(0) &&
   1257                  util_is_power_of_two(imm0.reg.data.u32 + 1)) {
   1258          // low byte = offset, high byte = width
   1259          uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
   1260          i->op = OP_EXTBF;
   1261          i->setSrc(0, src->getSrc(0));
   1262          i->setSrc(1, new_ImmediateValue(prog, ext));
   1263       } else if (src->op == OP_SHL &&
   1264                  src->src(1).getImmediate(imm1) &&
   1265                  i->src(t).mod == Modifier(0) &&
   1266                  util_is_power_of_two(~imm0.reg.data.u32 + 1) &&
   1267                  util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
   1268          i->op = OP_MOV;
   1269          i->setSrc(s, NULL);
   1270          if (t) {
   1271             i->setSrc(0, i->getSrc(t));
   1272             i->setSrc(t, NULL);
   1273          }
   1274       }
   1275    }
   1276       break;
   1277 
   1278    case OP_SHL:
   1279    {
   1280       if (s != 1 || i->src(0).mod != Modifier(0))
   1281          break;
   1282       // try to concatenate shifts
   1283       Instruction *si = i->getSrc(0)->getInsn();
   1284       if (!si)
   1285          break;
   1286       ImmediateValue imm1;
   1287       switch (si->op) {
   1288       case OP_SHL:
   1289          if (si->src(1).getImmediate(imm1)) {
   1290             bld.setPosition(i, false);
   1291             i->setSrc(0, si->getSrc(0));
   1292             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
   1293          }
   1294          break;
   1295       case OP_SHR:
   1296          if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
   1297             bld.setPosition(i, false);
   1298             i->op = OP_AND;
   1299             i->setSrc(0, si->getSrc(0));
   1300             i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
   1301          }
   1302          break;
   1303       case OP_MUL:
   1304          int muls;
   1305          if (isFloatType(si->dType))
   1306             return;
   1307          if (si->src(1).getImmediate(imm1))
   1308             muls = 1;
   1309          else if (si->src(0).getImmediate(imm1))
   1310             muls = 0;
   1311          else
   1312             return;
   1313 
   1314          bld.setPosition(i, false);
   1315          i->op = OP_MUL;
   1316          i->setSrc(0, si->getSrc(!muls));
   1317          i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
   1318          break;
   1319       case OP_SUB:
   1320       case OP_ADD:
   1321          int adds;
   1322          if (isFloatType(si->dType))
   1323             return;
   1324          if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
   1325             adds = 0;
   1326          else if (si->src(1).getImmediate(imm1))
   1327             adds = 1;
   1328          else
   1329             return;
   1330          if (si->src(!adds).mod != Modifier(0))
   1331             return;
   1332          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
   1333 
   1334          // This is more operations, but if one of x, y is an immediate, then
   1335          // we can get a situation where (a) we can use ISCADD, or (b)
   1336          // propagate the add bit into an indirect load.
   1337          bld.setPosition(i, false);
   1338          i->op = si->op;
   1339          i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
   1340          i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
   1341                                      bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
   1342                                      si->getSrc(!adds),
   1343                                      bld.mkImm(imm0.reg.data.u32)));
   1344          break;
   1345       default:
   1346          return;
   1347       }
   1348    }
   1349       break;
   1350 
   1351    case OP_ABS:
   1352    case OP_NEG:
   1353    case OP_SAT:
   1354    case OP_LG2:
   1355    case OP_RCP:
   1356    case OP_SQRT:
   1357    case OP_RSQ:
   1358    case OP_PRESIN:
   1359    case OP_SIN:
   1360    case OP_COS:
   1361    case OP_PREEX2:
   1362    case OP_EX2:
   1363       unary(i, imm0);
   1364       break;
   1365    case OP_BFIND: {
   1366       int32_t res;
   1367       switch (i->dType) {
   1368       case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
   1369       case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
   1370       default:
   1371          return;
   1372       }
   1373       if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
   1374          res = 31 - res;
   1375       bld.setPosition(i, false); /* make sure bld is init'ed */
   1376       i->setSrc(0, bld.mkImm(res));
   1377       i->setSrc(1, NULL);
   1378       i->op = OP_MOV;
   1379       i->subOp = 0;
   1380       break;
   1381    }
   1382    case OP_POPCNT: {
   1383       // Only deal with 1-arg POPCNT here
   1384       if (i->srcExists(1))
   1385          break;
   1386       uint32_t res = util_bitcount(imm0.reg.data.u32);
   1387       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
   1388       i->setSrc(1, NULL);
   1389       i->op = OP_MOV;
   1390       break;
   1391    }
   1392    case OP_CVT: {
   1393       Storage res;
   1394 
   1395       // TODO: handle 64-bit values properly
   1396       if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
   1397          return;
   1398 
   1399       // TODO: handle single byte/word extractions
   1400       if (i->subOp)
   1401          return;
   1402 
   1403       bld.setPosition(i, true); /* make sure bld is init'ed */
   1404 
   1405 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
   1406    case type: \
   1407       switch (i->sType) { \
   1408       case TYPE_F64: \
   1409          res.data.dst = util_iround(i->saturate ? \
   1410                                     CLAMP(imm0.reg.data.f64, fmin, fmax) : \
   1411                                     imm0.reg.data.f64); \
   1412          break; \
   1413       case TYPE_F32: \
   1414          res.data.dst = util_iround(i->saturate ? \
   1415                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
   1416                                     imm0.reg.data.f32); \
   1417          break; \
   1418       case TYPE_S32: \
   1419          res.data.dst = i->saturate ? \
   1420                         CLAMP(imm0.reg.data.s32, imin, imax) : \
   1421                         imm0.reg.data.s32; \
   1422          break; \
   1423       case TYPE_U32: \
   1424          res.data.dst = i->saturate ? \
   1425                         CLAMP(imm0.reg.data.u32, umin, umax) : \
   1426                         imm0.reg.data.u32; \
   1427          break; \
   1428       case TYPE_S16: \
   1429          res.data.dst = i->saturate ? \
   1430                         CLAMP(imm0.reg.data.s16, imin, imax) : \
   1431                         imm0.reg.data.s16; \
   1432          break; \
   1433       case TYPE_U16: \
   1434          res.data.dst = i->saturate ? \
   1435                         CLAMP(imm0.reg.data.u16, umin, umax) : \
   1436                         imm0.reg.data.u16; \
   1437          break; \
   1438       default: return; \
   1439       } \
   1440       i->setSrc(0, bld.mkImm(res.data.dst)); \
   1441       break
   1442 
   1443       switch(i->dType) {
   1444       CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
   1445       CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
   1446       CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
   1447       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
   1448       case TYPE_F32:
   1449          switch (i->sType) {
   1450          case TYPE_F64:
   1451             res.data.f32 = i->saturate ?
   1452                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
   1453                imm0.reg.data.f64;
   1454             break;
   1455          case TYPE_F32:
   1456             res.data.f32 = i->saturate ?
   1457                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
   1458                imm0.reg.data.f32;
   1459             break;
   1460          case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
   1461          case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
   1462          case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
   1463          case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
   1464          default:
   1465             return;
   1466          }
   1467          i->setSrc(0, bld.mkImm(res.data.f32));
   1468          break;
   1469       case TYPE_F64:
   1470          switch (i->sType) {
   1471          case TYPE_F64:
   1472             res.data.f64 = i->saturate ?
   1473                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
   1474                imm0.reg.data.f64;
   1475             break;
   1476          case TYPE_F32:
   1477             res.data.f64 = i->saturate ?
   1478                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
   1479                imm0.reg.data.f32;
   1480             break;
   1481          case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
   1482          case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
   1483          case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
   1484          case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
   1485          default:
   1486             return;
   1487          }
   1488          i->setSrc(0, bld.mkImm(res.data.f64));
   1489          break;
   1490       default:
   1491          return;
   1492       }
   1493 #undef CASE
   1494 
   1495       i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
   1496       i->op = OP_MOV;
   1497       i->saturate = 0;
   1498       i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
   1499       break;
   1500    }
   1501    default:
   1502       return;
   1503    }
   1504    if (newi->op != op)
   1505       foldCount++;
   1506 }
   1507 
   1508 // =============================================================================
   1509 
   1510 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   // Process one basic block: fold ABS/NEG/NOT defining instructions into
   // source modifiers of their users, and merge SAT into its source insn.
   virtual bool visit(BasicBlock *);
};
   1516 
// Walk the block and, for each instruction, try to absorb modifier
// instructions (ABS/NEG/NOT) feeding its sources into per-source
// modifiers, where the target supports them. Also merges a trailing
// SAT into the instruction that produced its operand.
bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next; // i may be deleted below (SAT merge)

      if (0 && i->op == OP_SUB) {
         // intentionally disabled (dead code, kept for reference)
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         // Skip sources without a defining insn, predicated definitions,
         // and heavily shared values (heuristic: folding wouldn't let the
         // modifier instruction die anyway).
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            // Signed ABS/NEG into an unsigned ADD/MUL is the only allowed
            // cross-signedness combination here.
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         // Modifier(op) is 0 unless mi is one of the modifier ops.
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         // Compose with any modifier mi already applies to its own source.
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            // Bypass mi: read its source directly with the folded modifier.
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         // Fold SAT into the producing instruction if it is the only user
         // and the target can saturate that op.
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}
   1587 
   1588 // =============================================================================
   1589 
   1590 // MUL + ADD -> MAD/FMA
   1591 // MIN/MAX(a, a) -> a, etc.
   1592 // SLCT(a, b, const) -> cc(const) ? a : b
   1593 // RCP(RCP(a)) -> a
   1594 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // Per-pattern handlers; each rewrites the given instruction in place
   // when its specific algebraic pattern matches.
   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT_NEG(Instruction *);
   void handleCVT_CVT(Instruction *);
   void handleCVT_EXTBF(Instruction *);
   void handleSUCLAMP(Instruction *);
   void handleNEG(Instruction *);

   // Used by handlers that need to insert new instructions/immediates.
   BuildUtil bld;
};
   1615 
// Turn ABS(SUB(a, b)) -- or the equivalent ABS(ADD(a, NEG(b))) -- into
// SAD(a, b, 0) when the target supports SAD for this type and no source
// modifiers are already present.
void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // expect not to have mods yet, if we do, bail
   if (sub->src(0).mod || sub->src(1).mod)
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
         return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      // For ADD, one operand must come from a NEG of matching type;
      // figure out which one, swapping roles if the NEG is on src0.
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      // Use the NEG's operand directly as the subtrahend.
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB))
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   // SAD accumulates into src2; seed it with 0.
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}
   1661 
   1662 bool
   1663 AlgebraicOpt::handleADD(Instruction *add)
   1664 {
   1665    Value *src0 = add->getSrc(0);
   1666    Value *src1 = add->getSrc(1);
   1667 
   1668    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
   1669       return false;
   1670 
   1671    bool changed = false;
   1672    if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
   1673       changed = tryADDToMADOrSAD(add, OP_MAD);
   1674    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
   1675       changed = tryADDToMADOrSAD(add, OP_SAD);
   1676    return changed;
   1677 }
   1678 
   1679 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
   1680 // ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   // the producing op we look for on one of the ADD's operands
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   // modifiers we cannot absorb: MAD tolerates NEG only, SAD tolerates none
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   // the mul/sad result must have this ADD as its only user
   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   src = add->getSrc(s);

   // do not fuse across basic block boundaries
   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
      return false;

   // these attributes cannot be represented on the fused op
   if (src->getInsn()->saturate || src->getInsn()->postFactor ||
       src->getInsn()->dnz)
      return false;

   if (toOp == OP_SAD) {
      // only SAD(a, b, 0) has a free accumulator slot for the addend
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   // reject hidden size changes or int<->float conversions
   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
      return false;

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high
   add->dType = src->getInsn()->dType; // sign matters for imad hi
   add->sType = src->getInsn()->sType;

   // the other ADD operand becomes the accumulator source
   add->setSrc(2, add->src(s ? 0 : 1));

   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s]; // fold the ADD operand's NEG into src0
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}
   1745 
   1746 void
   1747 AlgebraicOpt::handleMINMAX(Instruction *minmax)
   1748 {
   1749    Value *src0 = minmax->getSrc(0);
   1750    Value *src1 = minmax->getSrc(1);
   1751 
   1752    if (src0 != src1 || src0->reg.file != FILE_GPR)
   1753       return;
   1754    if (minmax->src(0).mod == minmax->src(1).mod) {
   1755       if (minmax->def(0).mayReplace(minmax->src(0))) {
   1756          minmax->def(0).replace(minmax->src(0), false);
   1757          minmax->bb->remove(minmax);
   1758       } else {
   1759          minmax->op = OP_CVT;
   1760          minmax->setSrc(1, NULL);
   1761       }
   1762    } else {
   1763       // TODO:
   1764       // min(x, -x) = -abs(x)
   1765       // min(x, -abs(x)) = -abs(x)
   1766       // min(x, abs(x)) = x
   1767       // max(x, -abs(x)) = x
   1768       // max(x, abs(x)) = abs(x)
   1769       // max(x, -x) = abs(x)
   1770    }
   1771 }
   1772 
   1773 void
   1774 AlgebraicOpt::handleRCP(Instruction *rcp)
   1775 {
   1776    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
   1777 
   1778    if (si && si->op == OP_RCP) {
   1779       Modifier mod = rcp->src(0).mod * si->src(0).mod;
   1780       rcp->op = mod.getOp();
   1781       rcp->setSrc(0, si->getSrc(0));
   1782    }
   1783 }
   1784 
   1785 void
   1786 AlgebraicOpt::handleSLCT(Instruction *slct)
   1787 {
   1788    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
   1789       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
   1790          slct->setSrc(0, slct->getSrc(1));
   1791    } else
   1792    if (slct->getSrc(0) != slct->getSrc(1)) {
   1793       return;
   1794    }
   1795    slct->op = OP_MOV;
   1796    slct->setSrc(1, NULL);
   1797    slct->setSrc(2, NULL);
   1798 }
   1799 
// Simplify AND/OR/XOR: identical operands collapse, and a logic op of two
// SET results can be fused into the hardware's combined SET_<op> form.
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      // AND(a, a) == OR(a, a) == a: forward the operand and drop the op
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      // set1 must be a plain SET (it becomes the fused op); swap if needed
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      // set0 may itself already be a fused SET variant
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      // only worthwhile if at least one original result becomes dead
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      // clone both SETs right after the logop (insertion order puts set0
      // before set1); set0 now yields a predicate that set1 combines with
      // its own comparison via redOp
      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}
   1863 
   1864 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
   1865 // nv50:
   1866 //  F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   // only an unmodified f32 -> s32 conversion qualifies
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      // nv50 path: the NEG is fed by I2F(ABS(SET))
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   // replace the whole chain by a single SET producing the integer
   // boolean (-1/0) directly
   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}
   1902 
   1903 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
   1904 // does a type conversion, this becomes trickier as there might be range
   1905 // changes/etc. We could handle those in theory as long as the range was being
   1906 // reduced or kept the same.
   1907 void
   1908 AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
   1909 {
   1910    Instruction *insn = cvt->getSrc(0)->getInsn();
   1911    RoundMode rnd = insn->rnd;
   1912 
   1913    if (insn->saturate ||
   1914        insn->subOp ||
   1915        insn->dType != insn->sType ||
   1916        insn->dType != cvt->sType)
   1917       return;
   1918 
   1919    switch (insn->op) {
   1920    case OP_CEIL:
   1921       rnd = ROUND_PI;
   1922       break;
   1923    case OP_FLOOR:
   1924       rnd = ROUND_MI;
   1925       break;
   1926    case OP_TRUNC:
   1927       rnd = ROUND_ZI;
   1928       break;
   1929    case OP_CVT:
   1930       break;
   1931    default:
   1932       return;
   1933    }
   1934 
   1935    if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
   1936       rnd = (RoundMode)(rnd & 3);
   1937 
   1938    cvt->rnd = rnd;
   1939    cvt->setSrc(0, insn->getSrc(0));
   1940    cvt->src(0).mod *= insn->src(0).mod;
   1941    cvt->sType = insn->sType;
   1942 }
   1943 
   1944 // Some shaders extract packed bytes out of words and convert them to
   1945 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
   1946 // nv50 for word sizes.
   1947 //
   1948 // CVT(EXTBF(x, byte/word))
   1949 // CVT(AND(bytemask, x))
   1950 // CVT(AND(bytemask, SHR(x, 8/16/24)))
   1951 // CVT(SHR(x, 16/24))
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;             // stays NULL if no pattern matched
   unsigned width, offset;        // extracted field, in bits
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      // EXTBF encodes the field as (width << 8) | offset
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      // only byte/word extractions at byte/word boundaries qualify
      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      // the mask determines the field width
      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      // a SHR feeding the AND supplies a byte/word-aligned offset
      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();
      offset = 0;
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      // a plain shift keeps only the topmost byte/word
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   // narrow the conversion's source type to the extracted field
   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3; // CVT's sub-op selects the source byte
}
   2044 
   2045 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
{
   ImmediateValue imm;
   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
   int s;
   Instruction *add;

   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);

   // look for ADD (TODO: only count references by non-SUCLAMP)
   if (insn->getSrc(0)->refCount() > 1)
      return;
   add = insn->getSrc(0)->getInsn();
   if (!add || add->op != OP_ADD ||
       (add->dType != TYPE_U32 &&
        add->dType != TYPE_S32))
      return;

   // look for immediate
   for (s = 0; s < 2; ++s)
      if (add->src(s).getImmediate(imm))
         break;
   if (s >= 2)
      return;
   s = s ? 0 : 1; // s now indexes the non-immediate operand
   // determine if immediate fits
   val += imm.reg.data.s32;
   if (val > 31 || val < -32) // combined offset must fit a signed 6-bit field
      return;
   // determine if other addend fits
   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
      return;

   bld.setPosition(insn, false); // make sure bld is init'ed
   // replace sources
   insn->setSrc(2, bld.mkImm(val));
   insn->setSrc(0, add->getSrc(s));
}
   2085 
   2086 // NEG(AND(SET, 1)) -> SET
   2087 void
   2088 AlgebraicOpt::handleNEG(Instruction *i) {
   2089    Instruction *src = i->getSrc(0)->getInsn();
   2090    ImmediateValue imm;
   2091    int b;
   2092 
   2093    if (isFloatType(i->sType) || !src || src->op != OP_AND)
   2094       return;
   2095 
   2096    if (src->src(0).getImmediate(imm))
   2097       b = 1;
   2098    else if (src->src(1).getImmediate(imm))
   2099       b = 0;
   2100    else
   2101       return;
   2102 
   2103    if (!imm.isInteger(1))
   2104       return;
   2105 
   2106    Instruction *set = src->getSrc(b)->getInsn();
   2107    if ((set->op == OP_SET || set->op == OP_SET_AND ||
   2108        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
   2109        !isFloatType(set->dType)) {
   2110       i->def(0).replace(set->getDef(0), false);
   2111    }
   2112 }
   2113 
   2114 bool
   2115 AlgebraicOpt::visit(BasicBlock *bb)
   2116 {
   2117    Instruction *next;
   2118    for (Instruction *i = bb->getEntry(); i; i = next) {
   2119       next = i->next;
   2120       switch (i->op) {
   2121       case OP_ABS:
   2122          handleABS(i);
   2123          break;
   2124       case OP_ADD:
   2125          handleADD(i);
   2126          break;
   2127       case OP_RCP:
   2128          handleRCP(i);
   2129          break;
   2130       case OP_MIN:
   2131       case OP_MAX:
   2132          handleMINMAX(i);
   2133          break;
   2134       case OP_SLCT:
   2135          handleSLCT(i);
   2136          break;
   2137       case OP_AND:
   2138       case OP_OR:
   2139       case OP_XOR:
   2140          handleLOGOP(i);
   2141          break;
   2142       case OP_CVT:
   2143          handleCVT_NEG(i);
   2144          handleCVT_CVT(i);
   2145          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
   2146              handleCVT_EXTBF(i);
   2147          break;
   2148       case OP_SUCLAMP:
   2149          handleSUCLAMP(i);
   2150          break;
   2151       case OP_NEG:
   2152          handleNEG(i);
   2153          break;
   2154       default:
   2155          break;
   2156       }
   2157    }
   2158 
   2159    return true;
   2160 }
   2161 
   2162 // =============================================================================
   2163 
   2164 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
class LateAlgebraicOpt : public Pass
{
private:
   // Runs per instruction (not per block) after the main AlgebraicOpt pass.
   virtual bool visit(Instruction *);

   void handleADD(Instruction *);      // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
   bool tryADDToSHLADD(Instruction *); // returns true if the ADD was rewritten
};
   2173 
   2174 void
   2175 LateAlgebraicOpt::handleADD(Instruction *add)
   2176 {
   2177    Value *src0 = add->getSrc(0);
   2178    Value *src1 = add->getSrc(1);
   2179 
   2180    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
   2181       return;
   2182 
   2183    if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
   2184       tryADDToSHLADD(add);
   2185 }
   2186 
   2187 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
bool
LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   ImmediateValue imm;
   Instruction *shl;
   Value *src;
   int s;

   // SHLADD cannot express saturation, flags, 64-bit or float adds
   if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
       || isFloatType(add->dType))
      return false;

   // find the operand produced by a SHL
   if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
      s = 0;
   else
   if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
      s = 1;
   else
      return false;

   src = add->getSrc(s);
   shl = src->getUniqueInsn();

   // the SHL must be in the same block and free of flags/sub-op/modifiers
   if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
      return false;

   // the shift amount must be a known constant
   if (!shl->src(1).getImmediate(imm))
      return false;

   add->op = OP_SHLADD;
   add->setSrc(2, add->src(!s));
   // SHL can't have any modifiers, but the ADD source may have had
   // one. Preserve it.
   add->setSrc(0, shl->getSrc(0));
   if (s == 1)
      add->src(0).mod = add->src(1).mod;
   add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
   add->src(1).mod = Modifier(0);

   return true;
}
   2231 
   2232 bool
   2233 LateAlgebraicOpt::visit(Instruction *i)
   2234 {
   2235    switch (i->op) {
   2236    case OP_ADD:
   2237       handleADD(i);
   2238       break;
   2239    default:
   2240       break;
   2241    }
   2242 
   2243    return true;
   2244 }
   2245 
   2246 // =============================================================================
   2247 
   2248 static inline void
   2249 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
   2250 {
   2251    if (offset != ldst->getSrc(0)->reg.data.offset) {
   2252       if (ldst->getSrc(0)->refCount() > 1)
   2253          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
   2254       ldst->getSrc(0)->reg.data.offset = offset;
   2255    }
   2256 }
   2257 
   2258 // Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   // Bookkeeping entry for one load or store already seen in the block.
   class Record
   {
   public:
      Record *next;
      Instruction *insn;   // the load/store this record describes
      const Value *rel[2]; // indirect address sources, NULL if absent
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;        // access size in bytes
      bool locked;         // store may no longer be merged/forwarded
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   // per data-file lists of the loads/stores recorded so far
   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool; // allocator for Record objects

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};
   2314 
   2315 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
   2316 {
   2317    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
   2318       loads[i] = NULL;
   2319       stores[i] = NULL;
   2320    }
   2321    prevRecord = NULL;
   2322 }
   2323 
   2324 void
   2325 MemoryOpt::reset()
   2326 {
   2327    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
   2328       Record *it, *next;
   2329       for (it = loads[i]; it; it = next) {
   2330          next = it->next;
   2331          recordPool.release(it);
   2332       }
   2333       loads[i] = NULL;
   2334       for (it = stores[i]; it; it = next) {
   2335          next = it->next;
   2336          recordPool.release(it);
   2337       }
   2338       stores[i] = NULL;
   2339    }
   2340 }
   2341 
// Merge load @ld into the adjacent load tracked by @rec, producing a single
// wider load that carries the definitions of both. Returns false if the
// combined access would be unsupported or misaligned.
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // count the record's existing definitions (j) by consuming its size
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      // the new load covers the lower offsets: shift the record's defs up
      // to make room for @ld's defs at the front
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      // lower the record's base offset (cloning a shared address symbol)
      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   // widen the surviving load to cover both accesses
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
   2397 
// Merge store @st with the adjacent store tracked by @rec into one wider
// store; @st survives and takes over all source values. Returns false if the
// combined access would be unsupported or misaligned.
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;
   // for compute indirect stores are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // the record covers the lower offsets: its values must come first
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      // @st already covers the lower offsets: append the record's values
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   // the record's old store is fully subsumed by @st
   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}
   2457 
   2458 void
   2459 MemoryOpt::Record::set(const Instruction *ldst)
   2460 {
   2461    const Symbol *mem = ldst->getSrc(0)->asSym();
   2462    fileIndex = mem->reg.fileIndex;
   2463    rel[0] = ldst->getIndirect(0, 0);
   2464    rel[1] = ldst->getIndirect(0, 1);
   2465    offset = mem->reg.data.offset;
   2466    base = mem->getBase();
   2467    size = typeSizeof(ldst->sType);
   2468 }
   2469 
   2470 void
   2471 MemoryOpt::Record::link(Record **list)
   2472 {
   2473    next = *list;
   2474    if (next)
   2475       next->prev = this;
   2476    prev = NULL;
   2477    *list = this;
   2478 }
   2479 
   2480 void
   2481 MemoryOpt::Record::unlink(Record **list)
   2482 {
   2483    if (next)
   2484       next->prev = prev;
   2485    if (prev)
   2486       prev->next = next;
   2487    else
   2488       *list = next;
   2489 }
   2490 
   2491 MemoryOpt::Record **
   2492 MemoryOpt::getList(const Instruction *insn)
   2493 {
   2494    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
   2495       return &loads[insn->src(0).getFile()];
   2496    return &stores[insn->src(0).getFile()];
   2497 }
   2498 
   2499 void
   2500 MemoryOpt::addRecord(Instruction *i)
   2501 {
   2502    Record **list = getList(i);
   2503    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
   2504 
   2505    it->link(list);
   2506    it->set(i);
   2507    it->insn = i;
   2508    it->locked = false;
   2509 }
   2510 
// Search the load or store records for one whose access overlaps or is
// adjacent to @insn's. Returns an overlapping/covering record immediately;
// otherwise remembers an alignment-compatible adjacent candidate and sets
// @isAdj accordingly.
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      // locked stores may only be considered for loads
      if (it->locked && insn->op != OP_LOAD)
         continue;
      // must be within the same 16-byte block, with identical indirect
      // sources and file index
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            // a true overlap is an immediate match
            if (!isAdj)
               return it;
            // adjacent below: only usable if the record is 8-byte aligned
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         // record fully covers this access
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}
   2548 
// Replace the value read by @ld with the value(s) that the recorded store
// @rec->insn wrote to the same location, eliminating the reload entirely.
// Fails (returns false) when the store's source layout does not line up
// with the load's destinations.
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   // advance through the store's value sources until we reach the
   // load's starting offset
   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   // forward each stored value into the matching load destination
   // NOTE(review): defs replaced before a later size/file mismatch are not
   // rolled back — presumably the register layouts always agree once the
   // offsets match; confirm against findRecord's adjacency rules.
   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      // only GPR values can be forwarded to the load's users
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}
   2572 
// Replace load @ldE by reusing the results of an earlier load @rec->insn
// from the same (or an enclosing) location: all uses of ldE's defs are
// redirected to the corresponding defs of the recorded load, and ldE is
// deleted. Returns false if the defs do not line up exactly.
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   // the recorded load must start at or before the newer one
   assert(offR <= offE);
   // skip over the recorded load's defs that precede ldE's offset
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   // the remaining defs must match pairwise in size
   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}
   2596 
// Merge the new store @st with an overlapping recorded store @rec->insn:
// @st absorbs the old store's values that it does not overwrite itself
// (before and/or after its own range), the old store is deleted, and the
// record is updated to describe the combined, wider store.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   // the merged store covers the union of both ranges
   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   // stash st's extra sources (see takeExtraSources) so the setSrc
   // shuffle below does not disturb them
   st->takeExtraSources(0, extra);

   if (offR < offS) {
      // old store starts first: rebuild st's source list as
      // [ri's leading values][st's values][ri's trailing values]
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      // st now starts at the old store's (lower) address
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      // st starts at or before the old store but the old store reaches
      // further: append the old store's trailing values
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s)->reg.size, ++s);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   // the record now describes the merged store
   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}
   2651 
   2652 bool
   2653 MemoryOpt::Record::overlaps(const Instruction *ldst) const
   2654 {
   2655    Record that;
   2656    that.set(ldst);
   2657 
   2658    if (this->fileIndex != that.fileIndex)
   2659       return false;
   2660 
   2661    if (this->rel[0] || that.rel[0])
   2662       return this->base == that.base;
   2663    return
   2664       (this->offset < that.offset + that.size) &&
   2665       (this->offset + this->size > that.offset);
   2666 }
   2667 
   2668 // We must not eliminate stores that affect the result of @ld if
   2669 // we find later stores to the same location, and we may no longer
   2670 // merge them with later stores.
   2671 // The stored value can, however, still be used to determine the value
   2672 // returned by future loads.
   2673 void
   2674 MemoryOpt::lockStores(Instruction *const ld)
   2675 {
   2676    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
   2677       if (!r->locked && r->overlaps(ld))
   2678          r->locked = true;
   2679 }
   2680 
   2681 // Prior loads from the location of @st are no longer valid.
   2682 // Stores to the location of @st may no longer be used to derive
   2683 // the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   // with @st == NULL, purge ALL records of file @f unconditionally
   if (st)
      f = st->src(0).getFile();

   // NOTE(review): iterating via r->next after r->unlink() relies on
   // unlink() only detaching the node without clearing its next pointer —
   // confirm against Record::unlink's definition.
   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}
   2698 
   2699 bool
   2700 MemoryOpt::visit(BasicBlock *bb)
   2701 {
   2702    bool ret = runOpt(bb);
   2703    // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
   2704    // where 96 bit memory operations are forbidden.
   2705    if (ret)
   2706       ret = runOpt(bb);
   2707    return ret;
   2708 }
   2709 
// One forward sweep over @bb: eliminate reloads, coalesce adjacent
// loads/stores, and invalidate records across barrier-like operations.
bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         // drop 32-bit stores whose value comes from a NOP (presumably an
         // undefined value)
         if (typeSizeof(ldst->dType) == 4 &&
             ldst->src(1).getFile() == FILE_GPR &&
             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
            delete_Instruction(prog, ldst);
            continue;
         }
         isLoad = false;
      } else {
         // not a load/store: certain ops invalidate our records
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            // callees / other threads may have modified any memory
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;
      if (ldst->perPatch) // TODO: create separate per-patch lists
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         // a load that survives must keep its source stores alive
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         // a surviving store invalidates overlapping records
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}
   2806 
   2807 // =============================================================================
   2808 
   2809 // Turn control flow into predicated instructions (after register allocation !).
   2810 // TODO:
   2811 // Could move this to before register allocation on NVC0 and also handle nested
   2812 // constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   // convert a simple if/else at @bb into predicated instructions
   bool tryPredicateConditional(BasicBlock *);
   // apply (@cc, @pred) to all instructions of a block and drop its exit flow
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   // replace a jump to a single-flow-instruction block by that instruction
   void tryPropagateBranch(BasicBlock *);
   // whether @pred is computed from immediate/constant inputs only
   inline bool isConstantCondition(Value *pred);
   // whether @insn may be predicated on @pred
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   // delete a BRA/JOIN and clean up its predicate if now unused
   inline void removeFlow(Instruction *);

   uint8_t gpr_unit; // GPR file unit, cached per function (see visit(Function*))
};
   2828 
// Whether @pred is the result of a SET whose inputs are all immediates or
// constant-memory values. tryPredicateConditional() uses a stricter length
// limit for such conditions.
bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   // only plain two-source SETs qualify
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         // the input must be a direct (non-indirect) MOV or LOAD; its
         // source file decides whether it is constant
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
         // in register "units", which can vary between targets.
         if (file == FILE_GPR) {
            Value *v = insn->getSrc(s);
            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
            int units = bytes >> gpr_unit;
            // a GPR id beyond maxGPR (presumably the always-zero register)
            // counts as an immediate
            if (units > prog->maxGPR)
               file = FILE_IMMEDIATE;
         }
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}
   2863 
// Delete a BRA or JOIN instruction when that is safe, and if its predicate
// thereby becomes unused, deallocate the predicate register and delete its
// defining instruction as well.
void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      // cross/back edges cannot simply fall through; keep the branch
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   // if that was the predicate's last use, clean it up too
   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}
   2891 
   2892 void
   2893 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
   2894 {
   2895    for (Instruction *i = bb->getEntry(); i; i = i->next) {
   2896       if (i->isNop())
   2897          continue;
   2898       assert(!i->getPredicate());
   2899       i->setPredicate(cc, pred);
   2900    }
   2901    removeFlow(bb->getExit());
   2902 }
   2903 
   2904 bool
   2905 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
   2906 {
   2907    if (insn->isPseudo())
   2908       return true;
   2909    // TODO: calls where we don't know which registers are modified
   2910 
   2911    if (!prog->getTarget()->mayPredicate(insn, pred))
   2912       return false;
   2913    for (int d = 0; insn->defExists(d); ++d)
   2914       if (insn->getDef(d)->equals(pred))
   2915          return false;
   2916    return true;
   2917 }
   2918 
   2919 // If we jump to BRA/RET/EXIT, replace the jump with it.
   2920 // NOTE: We do not update the CFG anymore here !
   2921 //
   2922 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
   2923 //  BB:0
   2924 //   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
   2925 //  BB1:
   2926 //   bra BB:3
   2927 //  BB2:
   2928 //   ...
   2929 //  BB3:
   2930 //   ...
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   // walk backwards over the trailing BRAs of @bb
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      // only consider targets consisting of a single instruction
      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      // the replacement must be an unpredicated BRA/JOIN/EXIT
      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      // be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      // add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      // if we were the only predecessor, the target's flow insn is now dead
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}
   2960 
   2961 bool
   2962 FlatteningPass::visit(Function *fn)
   2963 {
   2964    gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
   2965 
   2966    return true;
   2967 }
   2968 
bool
FlatteningPass::visit(BasicBlock *bb)
{
   // first, try to replace the whole conditional by predication
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   if (prog->getTarget()->hasJoin) {
      Instruction *insn = bb->getExit();
      if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
         insn = insn->prev;
         // the join flag cannot go on arbitrary instructions; several of
         // the exclusions below are marked empirical by their authors
         if (insn && !insn->getPredicate() &&
             !insn->asFlow() &&
             insn->op != OP_DISCARD &&
             insn->op != OP_TEXBAR &&
             !isTextureOp(insn->op) && // probably just nve4
             !isSurfaceOp(insn->op) && // not confirmed
             insn->op != OP_LINTERP && // probably just nve4
             insn->op != OP_PINTERP && // probably just nve4
             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
             !insn->isNop()) {
            insn->join = 1;
            bb->remove(bb->getExit());
            return true;
         }
      }
   }

   tryPropagateBranch(bb);

   return true;
}
   3002 
// If @bb terminates a simple two-way conditional whose sides are short and
// predicatable, predicate both sides and remove the branch/join flow.
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   // mask bits select which of the two successor blocks may be predicated
   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   // use a stricter length limit for constant conditions
   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      // first successor ("taken" side)
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      // second successor (gets the inverse condition)
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}
   3063 
   3064 // =============================================================================
   3065 
   3066 // Fold Immediate into MAD; must be done after register allocation due to
   3067 // constraint SDST == SSRC2
   3068 // TODO:
   3069 // Does NVC0+ have other situations where this pass makes sense?
class NV50PostRaConstantFolding : public Pass
{
private:
   // folds immediate MOV sources into MADs where SDST == SSRC2 holds
   virtual bool visit(BasicBlock *);
};
   3075 
   3076 static bool
   3077 post_ra_dead(Instruction *i)
   3078 {
   3079    for (int d = 0; i->defExists(d); ++d)
   3080       if (i->getDef(d)->refCount())
   3081          return false;
   3082    return true;
   3083 }
   3084 
   3085 bool
   3086 NV50PostRaConstantFolding::visit(BasicBlock *bb)
   3087 {
   3088    Value *vtmp;
   3089    Instruction *def;
   3090 
   3091    for (Instruction *i = bb->getFirst(); i; i = i->next) {
   3092       switch (i->op) {
   3093       case OP_MAD:
   3094          if (i->def(0).getFile() != FILE_GPR ||
   3095              i->src(0).getFile() != FILE_GPR ||
   3096              i->src(1).getFile() != FILE_GPR ||
   3097              i->src(2).getFile() != FILE_GPR ||
   3098              i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
   3099             break;
   3100 
   3101          if (i->getDef(0)->reg.data.id >= 64 ||
   3102              i->getSrc(0)->reg.data.id >= 64)
   3103             break;
   3104 
   3105          if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
   3106             break;
   3107 
   3108          if (i->getPredicate())
   3109             break;
   3110 
   3111          def = i->getSrc(1)->getInsn();
   3112          if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
   3113             def = def->getSrc(0)->getInsn();
   3114          if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
   3115             vtmp = i->getSrc(1);
   3116             if (isFloatType(i->sType)) {
   3117                i->setSrc(1, def->getSrc(0));
   3118             } else {
   3119                ImmediateValue val;
   3120                bool ret = def->src(0).getImmediate(val);
   3121                assert(ret);
   3122                if (i->getSrc(1)->reg.data.id & 1)
   3123                   val.reg.data.u32 >>= 16;
   3124                val.reg.data.u32 &= 0xffff;
   3125                i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
   3126             }
   3127 
   3128             /* There's no post-RA dead code elimination, so do it here
   3129              * XXX: if we add more code-removing post-RA passes, we might
   3130              *      want to create a post-RA dead-code elim pass */
   3131             if (post_ra_dead(vtmp->getInsn())) {
   3132                Value *src = vtmp->getInsn()->getSrc(0);
   3133                // Careful -- splits will have already been removed from the
   3134                // functions. Don't double-delete.
   3135                if (vtmp->getInsn()->bb)
   3136                   delete_Instruction(prog, vtmp->getInsn());
   3137                if (src->getInsn() && post_ra_dead(src->getInsn()))
   3138                   delete_Instruction(prog, src->getInsn());
   3139             }
   3140 
   3141             break;
   3142          }
   3143          break;
   3144       default:
   3145          break;
   3146       }
   3147    }
   3148 
   3149    return true;
   3150 }
   3151 
   3152 // =============================================================================
   3153 
   3154 // Common subexpression elimination. Stupid O^2 implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // redirect *ptr's defs to @i and delete *ptr if both compute the same
   // result; sets *ptr to NULL on success
   inline bool tryReplace(Instruction **, Instruction *);

   // per-opcode buckets of candidate instructions of the current block
   DLList ops[OP_LAST + 1];
};
   3164 
// Hoists an expression computed identically in all of a block's
// predecessors up through the joining phi (see GlobalCSE::visit).
class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
   3170 
// Compare two instructions on everything except their operands: opcode,
// data types, condition code, subclass-specific state and modifier bits.
bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      // compare the whole texture state block at once
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      // flow instructions never compare equal
      return false;
   } else {
      if (this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   // generic modifier state must match as well
   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache ||
       this->mask != that->mask)
      return false;

   return true;
}
   3213 
// Whether this instruction computes exactly the same results as @that,
// i.e. whether it could be removed with its defs taken over (CSE).
bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   // def lists must match pairwise and have the same length
   // (equals(.., false) — see Value::equals for what is compared)
   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   // source lists must match pairwise, including modifiers
   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   // memory reads only yield equal results when the location is read-only
   // from the shader's perspective
   if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      case FILE_SHADER_OUTPUT:
         // outputs are only readable in tessellation evaluation shaders
         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
      default:
         return false;
      }
   }

   return true;
}
   3262 
   3263 // pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      // each phi source must have the phi as its only user ...
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      if (ik->defCount(0xff) > 1)
         continue; // too painful to check if we can really push this forward
      // ... and all sources must compute the same result as the first
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      // loop ran to completion iff all sources qualified
      if (!phi->srcExists(s)) {
         // move the first copy into this block (after a leading JOIN if
         // present) and let it define the phi's destination directly
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}
   3302 
   3303 bool
   3304 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
   3305 {
   3306    Instruction *old = *ptr;
   3307 
   3308    // TODO: maybe relax this later (causes trouble with OP_UNION)
   3309    if (i->isPredicated())
   3310       return false;
   3311 
   3312    if (!old->isResultEqual(i))
   3313       return false;
   3314 
   3315    for (int d = 0; old->defExists(d); ++d)
   3316       old->def(d).replace(i->getDef(d), false);
   3317    delete_Instruction(prog, old);
   3318    *ptr = NULL;
   3319    return true;
   3320 }
   3321 
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   // repeat until a full sweep removes nothing more
   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getFirst(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         // fixed instructions must stay, but later ones may fold into them
         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         // pick the LValue source with the fewest uses so the use-list
         // scan below stays short
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            // candidates: earlier instructions in this block also using @src
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            // no LValue sources: compare against all prior ops of this opcode
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         // tryReplace() NULLed ir if it deleted the instruction
         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}
   3382 
   3383 // =============================================================================
   3384 
   3385 // Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   // run elimination repeatedly until no more dead instructions are found
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount; // instructions deleted during the current run
};
   3398 
   3399 bool
   3400 DeadCodeElim::buryAll(Program *prog)
   3401 {
   3402    do {
   3403       deadCount = 0;
   3404       if (!this->run(prog, false, false))
   3405          return false;
   3406    } while (deadCount);
   3407 
   3408    return true;
   3409 }
   3410 
bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   // walk backwards so that removing a user can expose dead producers
   // within the same sweep
   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         // multi-destination load: parts of it may be dead
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         // side-effecting ops can't be deleted, but their unused result
         // slots can be dropped
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            i->setDef(0, NULL);
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            // keep def(1) (presumably the lock status) as the only def
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}
   3440 
   3441 // Each load can go into up to 4 destinations, any of which might potentially
   3442 // be dead (i.e. a hole). These can always be split into 2 loads, independent
   3443 // of where the holes are. We find the first contiguous region, put it into
   3444 // the first load, and then put the second contiguous region into the second
   3445 // load. There can be at most 2 contiguous regions.
   3446 //
   3447 // Note that there are some restrictions, for example it's not possible to do
   3448 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
   3449 // up. Also hardware doesn't support 96-bit loads, so those also have to be
   3450 // split into a 64-bit and 32-bit load.
   3451 void
   3452 DeadCodeElim::checkSplitLoad(Instruction *ld1)
   3453 {
   3454    Instruction *ld2 = NULL; // can get at most 2 loads
   3455    Value *def1[4];
   3456    Value *def2[4];
   3457    int32_t addr1, addr2;
   3458    int32_t size1, size2;
   3459    int d, n1, n2;
   3460    uint32_t mask = 0xffffffff;
   3461 
   3462    for (d = 0; ld1->defExists(d); ++d)
   3463       if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
   3464          mask &= ~(1 << d);
   3465    if (mask == 0xffffffff)
   3466       return;
   3467 
   3468    addr1 = ld1->getSrc(0)->reg.data.offset;
   3469    n1 = n2 = 0;
   3470    size1 = size2 = 0;
   3471 
   3472    // Compute address/width for first load
   3473    for (d = 0; ld1->defExists(d); ++d) {
   3474       if (mask & (1 << d)) {
   3475          if (size1 && (addr1 & 0x7))
   3476             break;
   3477          def1[n1] = ld1->getDef(d);
   3478          size1 += def1[n1++]->reg.size;
   3479       } else
   3480       if (!n1) {
   3481          addr1 += ld1->getDef(d)->reg.size;
   3482       } else {
   3483          break;
   3484       }
   3485    }
   3486 
   3487    // Scale back the size of the first load until it can be loaded. This
   3488    // typically happens for TYPE_B96 loads.
   3489    while (n1 &&
   3490           !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
   3491                                                 typeOfSize(size1))) {
   3492       size1 -= def1[--n1]->reg.size;
   3493       d--;
   3494    }
   3495 
   3496    // Compute address/width for second load
   3497    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
   3498       if (mask & (1 << d)) {
   3499          assert(!size2 || !(addr2 & 0x7));
   3500          def2[n2] = ld1->getDef(d);
   3501          size2 += def2[n2++]->reg.size;
   3502       } else if (!n2) {
   3503          assert(!n2);
   3504          addr2 += ld1->getDef(d)->reg.size;
   3505       } else {
   3506          break;
   3507       }
   3508    }
   3509 
   3510    // Make sure that we've processed all the values
   3511    for (; ld1->defExists(d); ++d)
   3512       assert(!(mask & (1 << d)));
   3513 
   3514    updateLdStOffset(ld1, addr1, func);
   3515    ld1->setType(typeOfSize(size1));
   3516    for (d = 0; d < 4; ++d)
   3517       ld1->setDef(d, (d < n1) ? def1[d] : NULL);
   3518 
   3519    if (!n2)
   3520       return;
   3521 
   3522    ld2 = cloneShallow(func, ld1);
   3523    updateLdStOffset(ld2, addr2, func);
   3524    ld2->setType(typeOfSize(size2));
   3525    for (d = 0; d < 4; ++d)
   3526       ld2->setDef(d, (d < n2) ? def2[d] : NULL);
   3527 
   3528    ld1->bb->insertAfter(ld1, ld2);
   3529 }
   3530 
   3531 // =============================================================================
   3532 
// Instantiate pass class <n> and invoke its entry point <f> on this Program,
// but only when the requested optimization level is at least <l>.  Returns
// "false" from the enclosing function if the pass reports failure.
#define RUN_PASS(l, n, f)                       \
   if (level >= (l)) {                          \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
         INFO("PEEPHOLE: %s\n", #n);            \
      n pass;                                   \
      if (!pass.f(this))                        \
         return false;                          \
   }
   3541 
// Run the SSA-level peephole pipeline.  The pass order is deliberate
// (e.g. ModifierFolding before load propagation, a final DCE at level 0
// to clean up after everything else) — do not reorder casually.
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(2, LateAlgebraicOpt, run);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run); // MemoryOpt may expose new CSE opportunities
   RUN_PASS(0, DeadCodeElim, buryAll); // always run, regardless of level

   return true;
}
   3562 
// Run post-register-allocation optimizations (level >= 2 only).
bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   // The post-RA constant folding pass only applies to pre-Fermi
   // (chipset < 0xc0) targets.
   if (getTarget()->getChipset() < 0xc0)
      RUN_PASS(2, NV50PostRaConstantFolding, run);

   return true;
}
   3572 
   3573 }
   3574