/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nv50.h"

namespace nv50_ir {

// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
//    -------------------
//        al*bh 00        HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
//     ah*bh 00 00              (           carry1) << 16 + ( carry2)
//           al*bl
//        ah*bl 00
//
// fffe0001 + fffe0001
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
   ImmediateValue src1;
   bool src1imm = mul->src(1).getImmediate(src1);

   DataType fTy; // full type
   switch (mul->sType) {
   case TYPE_S32: fTy = TYPE_U32; break;
   case TYPE_S64: fTy = TYPE_U64; break;
   default: fTy = mul->sType; break;
   }

   DataType hTy; // half type
   switch (fTy) {
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   default:
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *s[2];
   Value *a[2], *b[2];
   Value *t[4];
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   if (isSignedType(mul->sType) && highResult) {
      s[0] = bld->getSSA(fullSize);
      s[1] = bld->getSSA(fullSize);
      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
      src1.reg.data.s32 = abs(src1.reg.data.s32);
   } else {
      s[0] = mul->getSrc(0);
      s[1] = mul->getSrc(1);
   }

   // split sources into halves
   i[0] = bld->mkSplit(a, halfSize, s[0]);
   i[1] = bld->mkSplit(b, halfSize, s[1]);

   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
                               bld->mkImm(src1.reg.data.u32 & 0xffff));
   } else {
      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
         i[3] = i[2];
         t[1] = t[0];
      } else {
         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
      }
   }
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
      i[4] = i[3];
      t[3] = t[2];
   } else {
      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
   }

   if (highResult) {
      Value *c[2];
      Value *r[5];
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 5; ++j)
         r[j] = bld->getSSA(fullSize);

      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      // actual result required in negative case, but ignored for
      // unsigned. for some reason the compiler ends up dropping the whole
      // instruction if the destination is unused but the flags are.
      if (isSignedType(mul->sType))
         i[4]->setFlagsDef(1, c[1]);
      else
         i[4]->setFlagsDef(0, c[1]);
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);

      if (isSignedType(mul->sType)) {
         Value *cc[2];
         Value *rr[7];
         Value *one = bld->getSSA(fullSize);
         bld->loadImm(one, 1);
         for (int j = 0; j < 7; j++)
            rr[j] = bld->getSSA(fullSize);

         // NOTE: this logic uses predicates because splitting basic blocks is
         // ~impossible during the SSA phase. The RA relies on a correlation
         // between edge order and phi node sources.

         // Set the sign of the result based on the inputs
         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));

         // 1s complement of 64-bit value
         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
            ->setPredicate(CC_S, cc[0]);

         // add to low 32-bits, keep track of the carry
         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
         n->setPredicate(CC_S, cc[0]);
         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));

         // If there was a carry, add 1 to the upper 32 bits
         // XXX: These get executed even if they shouldn't be
         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
            ->setPredicate(CC_C, cc[1]);
         bld->mkMov(rr[3], rr[0])
            ->setPredicate(CC_NC, cc[1]);
         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);

         // Merge the results from the negative and non-negative paths
         bld->mkMov(rr[5], rr[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkMov(rr[6], r[4])
            ->setPredicate(CC_NS, cc[0]);
         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
      } else {
         bld->mkMov(mul->getDef(0), r[4]);
      }
   } else {
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL     UR     LL     LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
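
// For example, QUADOP(SUBR, SUBR, SUBR, SUBR) encodes as 0x55: all four lane
// slots use SUBR, which is how the TXB/TXL/TXD lowering below compares each
// lane's value against a chosen lane of the quad.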

class NV50LegalizePostRA : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void handlePRERET(FlowInstruction *);
   void replaceZero(Instruction *);

   LValue *r63;
};

bool
NV50LegalizePostRA::visit(Function *fn)
{
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
   // GPR units on nv50 are in half-regs
   if (prog->maxGPR < 126)
      r63->reg.data.id = 63;
   else
      r63->reg.data.id = 127;

   // this is actually per-program, but we can do it all on visiting main()
   std::list<Instruction *> *outWrites =
      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);

   if (outWrites) {
      for (std::list<Instruction *>::iterator it = outWrites->begin();
           it != outWrites->end(); ++it)
         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
      // instructions will be deleted on exit
      outWrites->clear();
   }

   return true;
}

void
NV50LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm && imm->reg.data.u64 == 0)
         i->setSrc(s, r63);
   }
}

// Emulate PRERET: jump to the target and call to the origin from there
//
// WARNING: atm only works if BBs are affected by at most a single PRERET
//
// BB:0
// preret BB:3
// (...)
// BB:3
// (...)
// --->
// BB:0
// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
// (...)
// BB:3
// bra BB:3 + n1 (skip the call)
// call BB:0 + n2 (skip bra at beginning of BB:0)
// (...)
void
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
{
   BasicBlock *bbE = pre->bb;
   BasicBlock *bbT = pre->target.bb;

   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
   bbE->remove(pre);
   bbE->insertHead(pre);

   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);

   bbT->insertHead(call);
   bbT->insertHead(skip);

   // NOTE: maybe split blocks to prevent the instructions from moving ?

   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
}

bool
NV50LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
         handlePRERET(i->asFlow());
      } else {
         // TODO: We will want to do this before register allocation,
         // since we have to use a $c register for the carry flag.
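         // split64BitOpPostRA() rewrites the op as two 32-bit halves and
         // returns the new high-half instruction; continuing from it lets
         // that instruction go through the same zero replacement below on
         // the next iteration.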
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
            if (hi)
               next = hi;
         }

         if (i->op != OP_PFETCH && i->op != OP_BAR &&
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   return true;
}

class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   void propagateWriteToOutput(Instruction *);
   void handleDIV(Instruction *);
   void handleMOD(Instruction *);
   void handleMUL(Instruction *);
   void handleAddrDef(Instruction *);

   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   std::list<Instruction *> *outWrites;
};

NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   if (prog->optLevel >= 2 &&
       (prog->getType() == Program::TYPE_GEOMETRY ||
        prog->getType() == Program::TYPE_VERTEX))
      outWrites =
         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   else
      outWrites = NULL;
}

void
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
{
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
      return;

   // check def instruction can store
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();

   // TODO: move exports (if beneficial) in common opt pass
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
      return;

   for (int s = 0; di->srcExists(s); ++s)
      if (di->src(s).getFile() == FILE_IMMEDIATE ||
          di->src(s).getFile() == FILE_MEMORY_LOCAL)
         return;

   if (prog->getType() == Program::TYPE_GEOMETRY) {
      // Only propagate output writes in geometry shaders when we can be sure
      // that we are propagating to the same output vertex.
      if (di->bb != st->bb)
         return;
      Instruction *i;
      for (i = di; i != st; i = i->next) {
         if (i->op == OP_EMIT || i->op == OP_RESTART)
            return;
      }
      assert(i); // st after di
   }

   // We cannot set defs to non-lvalues before register allocation, so
   // save & remove (to save registers) the exports and replace later.
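   // The saved EXPORTs are re-attached after register allocation in
   // NV50LegalizePostRA::visit(Function *), which makes the defining
   // instruction write the output register directly.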
   outWrites->push_back(st);
   st->bb->remove(st);
}

bool
NV50LegalizeSSA::isARL(const Instruction *i) const
{
   ImmediateValue imm;

   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
      return false;
   if (!i->src(1).getImmediate(imm))
      return false;
   return imm.isInteger(0);
}

void
NV50LegalizeSSA::handleAddrDef(Instruction *i)
{
   Instruction *arl;

   i->getDef(0)->reg.size = 2; // $aX are only 16 bit

   // PFETCH can always write to $a
   if (i->op == OP_PFETCH)
      return;
   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
         return;
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
         return;
   }

   // turn $a sources into $r sources (can't operate on $a)
   for (int s = 0; i->srcExists(s); ++s) {
      Value *a = i->getSrc(s);
      Value *r;
      if (a->reg.file == FILE_ADDRESS) {
         if (a->getInsn() && isARL(a->getInsn())) {
            i->setSrc(s, a->getInsn()->getSrc(0));
         } else {
            bld.setPosition(i, false);
            r = bld.getSSA();
            bld.mkMov(r, a);
            i->setSrc(s, r);
         }
      }
   }
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
      return;

   // turn result back into $a
   bld.setPosition(i, true);
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
   i->setDef(0, arl->getSrc(0));
}

void
NV50LegalizeSSA::handleMUL(Instruction *mul)
{
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
      return;
   Value *def = mul->getDef(0);
   Value *pred = mul->getPredicate();
   CondCode cc = mul->cc;
   if (pred)
      mul->setPredicate(CC_ALWAYS, NULL);

   if (mul->op == OP_MAD) {
      Instruction *add = mul;
      bld.setPosition(add, false);
      Value *res = cloneShallow(func, mul->getDef(0));
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
      add->op = OP_ADD;
      add->setSrc(0, mul->getDef(0));
      add->setSrc(1, add->getSrc(2));
      for (int s = 2; add->srcExists(s); ++s)
         add->setSrc(s, NULL);
      mul->subOp = add->subOp;
      add->subOp = 0;
   }
   expandIntegerMUL(&bld, mul);
   if (pred)
      def->getInsn()->setPredicate(cc, pred);
}

// Use f32 division: first compute an approximate result, use it to reduce
// the dividend, which should then be representable as f32, divide the reduced
// dividend, and add the quotients.
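//
// Roughly (an illustration of the sequence built below, not extra code):
//   q0 = u32(f32(a) * rcp(f32(b)))       // approximate quotient
//   r  = a - q0 * b                      // reduced dividend, fits in f32
//   q  = q0 + u32(f32(r) * rcp(f32(b)))  // add the quotients
//   if (a - q * b >= b) q += 1           // final correction
// The reciprocal is biased down by two ULPs, presumably so the estimates
// round down rather than overshoot.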
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   expandIntegerMUL(&bld,
                    bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   expandIntegerMUL(&bld,
                    bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
   if (!isSignedType(ty)) {
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}

void
NV50LegalizeSSA::handleMOD(Instruction *mod)
{
   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
      return;
   bld.setPosition(mod, false);

   Value *q = bld.getSSA();
   Value *m = bld.getSSA();

   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
   handleDIV(q->getInsn());

   bld.setPosition(mod, false);
   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));

   mod->op = OP_SUB;
   mod->setSrc(1, m);
}

bool
NV50LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *insn, *next;
   // skipping PHIs (don't pass them to handleAddrDef) !
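   // next is captured up front so instructions the handlers emit around insn
   // are not themselves revisited by this loop.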
   for (insn = bb->getEntry(); insn; insn = next) {
      next = insn->next;

      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
         handleAddrDef(insn);

      switch (insn->op) {
      case OP_EXPORT:
         if (outWrites)
            propagateWriteToOutput(insn);
         break;
      case OP_DIV:
         handleDIV(insn);
         break;
      case OP_MOD:
         handleMOD(insn);
         break;
      case OP_MAD:
      case OP_MUL:
         handleMUL(insn);
         break;
      default:
         break;
      }
   }
   return true;
}

class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   virtual bool visit(Function *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handlePFETCH(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleLOAD(Instruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3
   bool handleTXLQ(TexInstruction *);
   bool handleTXQ(TexInstruction *);

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);

private:
   const Target *const targ;

   BuildUtil bld;

   Value *tid;
};

NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
   targ(prog->getTarget()), tid(NULL)
{
   bld.setProgram(prog);
}

bool
NV50LoweringPreSSA::visit(Function *f)
{
   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());

   if (prog->getType() == Program::TYPE_COMPUTE) {
      // Add implicit "thread id" argument in $r0 to the function
      Value *arg = new_LValue(func, FILE_GPR);
      arg->reg.data.id = 0;
      f->ins.push_back(arg);

      bld.setPosition(root, false);
      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
   }

   return true;
}

void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                       Value **ms_x, Value **ms_y) {
   // This loads the texture-indexed ms setting from the constant buffer
   Value *tmp = new_LValue(func, FILE_GPR);
   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.suInfoBase;
   if (prog->getType() > Program::TYPE_VERTEX)
      off += 16 * 2 * 4;
   if (prog->getType() > Program::TYPE_GEOMETRY)
      off += 16 * 2 * 4;
   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
}

void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
   // Given a MS level, and a sample id, compute the delta x/y
   uint8_t b = prog->driver->io.msInfoCBSlot;
   Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);

   // The required information is at mslevel * 16 * 4 + sample * 8
   // = (mslevel * 8 + sample) * 8
   bld.mkOp2(OP_SHL,
             TYPE_U32,
             off,
             bld.mkOp2v(OP_ADD, TYPE_U32, t,
                        bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
                        s),
             bld.mkImm(3));
   *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase), off);
   *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase + 4), off);
}

bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   /* Only normalize in the non-explicit derivatives case.
    */
   if (i->tex.target.isCube() && i->op != OP_TXD) {
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
      Value *x = i->getSrc(0);
      Value *y = i->getSrc(1);
      Value *s = i->getSrc(arg - 1);
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
         *ms, *ms_x, *ms_y, *dx, *dy;

      i->tex.target.clearMS();

      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
      loadMsInfo(ms, s, &dx, &dy);

      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
      i->setSrc(0, tx);
      i->setSrc(1, ty);
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
   }

   // dref comes before bias/lod
   if (i->tex.target.isShadow())
      if (i->op == OP_TXB || i->op == OP_TXL)
         i->swapSources(dref, lod);

   if (i->tex.target.isArray()) {
      if (i->op != OP_TXF) {
         // array index must be converted to u32, but it's already an integer
         // for TXF
         Value *layer = i->getSrc(arg - 1);
         LValue *src = new_LValue(func, FILE_GPR);
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
         i->setSrc(arg - 1, src);
      }
      if (i->tex.target.isCube() && i->srcCount() > 4) {
         std::vector<Value *> acube, a2d;
         int c;

         acube.resize(4);
         for (c = 0; c < 4; ++c)
            acube[c] = i->getSrc(c);
         a2d.resize(4);
         for (c = 0; c < 3; ++c)
            a2d[c] = new_LValue(func, FILE_GPR);
         a2d[3] = NULL;

         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
                   a2d, acube)->asTex()->tex.mask = 0x7;

         for (c = 0; c < 3; ++c)
            i->setSrc(c, a2d[c]);
         for (; i->srcExists(c + 1); ++c)
            i->setSrc(c, i->getSrc(c + 1));
         i->setSrc(c, NULL);
         assert(c <= 4);

         i->tex.target = i->tex.target.isShadow() ?
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);
   if (i->tex.useOffsets) {
      for (int c = 0; c < 3; ++c) {
         ImmediateValue val;
         if (!i->offset[0][c].getImmediate(val))
            assert(!"non-immediate offset");
         i->tex.offset[c] = val.reg.data.u32;
         i->offset[0][c].set(NULL);
      }
   }

   return true;
}

// Bias must be equal for all threads of a quad or lod calculation will fail.
//
// The lanes of a quad are grouped by the bit in the condition register they
// have set, which is selected by differing bias values.
// Move the input values for TEX into a new register set for each group and
// execute TEX only for a specific group.
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
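//
// (In the code below: each lane compares its bias against lane l's via a
// SUBR quadop, matching lanes pick up the bit 1 << l, the resulting flag
// byte predicates one of the four cloned TEX ops per group, and the UNIONs
// merge the per-group results back into the original defs.)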
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   // We can't actually apply bias *and* do a compare for a cube
   // texture. Since the compare has to be done before the filtering, just
   // drop the bias on the floor.
   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
      i->op = OP_TEX;
      i->setSrc(3, i->getSrc(4));
      i->setSrc(4, NULL);
      return handleTEX(i);
   }

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true;

   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}

// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
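//
// (Below, each iteration compares every lane's LOD against lane l's with a
// SUBR quadop; matching lanes branch straight to the TEX block, the rest
// fall through to a fresh block to be tested against the next lane.)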
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   if (lod->isUniform())
      return true;

   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
   return true;
}

bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
   i->tex.derivAll = true;

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
{
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s32 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   return true;
}

bool
NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
{
   Value *ms, *ms_x, *ms_y;
   if (i->tex.query == TXQ_DIMS)
      return true;
   assert(i->tex.query == TXQ_TYPE);
   assert(i->tex.mask == 4);

   loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
   bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
   i->bb->remove(i);

   return true;
}


bool
NV50LoweringPreSSA::handleSET(Instruction *i)
{
   if (i->dType == TYPE_F32) {
      bld.setPosition(i, true);
      i->dType = TYPE_U32;
      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
   }
   return true;
}

bool
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();
   Value *pred = bld.getScratch(1, FILE_FLAGS);

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   // XXX: these probably shouldn't be immediates in the first place ...
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.setPosition(i, true);
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);

   bld.setPosition(i, false);
   i->op = OP_SET;
   i->setFlagsDef(0, pred);
   i->dType = TYPE_U8;
   i->setSrc(0, i->getSrc(2));
   i->setSrc(2, NULL);
   i->setSrc(1, bld.loadImm(NULL, 0));

   return true;
}

bool
NV50LoweringPreSSA::handleSELP(Instruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
   delete_Instruction(prog, i);
   return true;
}

bool
NV50LoweringPreSSA::handleWRSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();

   // these are all shader outputs, $sreg are not writeable
   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));

   bld.getBB()->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleCALL(Instruction *i)
{
   if (prog->getType() == Program::TYPE_COMPUTE) {
      // Add implicit "thread id" argument in $r0 to the function
      i->setSrc(i->srcCount(), tid);
   }
   return true;
}

bool
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
{
   delete_Instruction(prog, i);
   return true;
}

bool
NV50LoweringPreSSA::handleCONT(Instruction *i)
{
   i->op = OP_BRA;
   return true;
}

bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   SVSemantic sv = sym->reg.data.sv.sv;
   int idx = sym->reg.data.sv.index;

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
         bld.mkOp1(OP_NEG, TYPE_S32, def, def);
         bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
      }
      break;
   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID:
      if ((sv == SV_NCTAID && idx >= 2) ||
          (sv == SV_NTID && idx >= 3)) {
         bld.mkMov(def, bld.mkImm(1));
      } else if (sv == SV_CTAID && idx >= 2) {
         bld.mkMov(def, bld.mkImm(0));
      } else {
         Value *x = bld.getSSA(2);
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      }
      break;
   case SV_TID:
      if (idx == 0) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
      } else if (idx == 1) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
      } else if (idx == 2) {
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
      } else {
         bld.mkMov(def, bld.mkImm(0));
      }
      break;
   case SV_SAMPLE_POS: {
      Value *off = new_LValue(func, FILE_ADDRESS);
      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
      bld.mkLoad(TYPE_F32,
                 def,
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                 off);
      break;
   }
   default:
      bld.mkFetch(i->getDef(0), i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
   bld.setPosition(i, true);
   i->op = OP_RSQ;
   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));

   return true;
}

bool
NV50LoweringPreSSA::handlePOW(Instruction *i)
{
   LValue *val = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);

   i->op = OP_EX2;
   i->setSrc(0, val);
   i->setSrc(1, NULL);

   return true;
}
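
// Fragment program outputs are not stored with EXPORT; they have to end up
// in fixed GPRs at the end of the shader (see "put FP outputs in GPRs" in
// the pass summary below), so the export becomes a final MOV into a register
// with a forced id.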
bool
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      if (i->getIndirect(0, 0)) {
         // TODO: redirect to l[] here, load to GPRs at exit
         return false;
      } else {
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units

         i->op = OP_MOV;
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
         i->src(0).set(i->src(1));
         i->setSrc(1, NULL);
         i->setDef(0, new_LValue(func, FILE_GPR));
         i->getDef(0)->reg.data.id = id;

         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
      }
   }
   return true;
}

// Handle indirect addressing in geometry shaders:
//
// ld $r0 a[$a1][$a2+k] ->
// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
//
bool
NV50LoweringPreSSA::handleLOAD(Instruction *i)
{
   ValueRef src = i->src(0);

   if (src.isIndirect(1)) {
      assert(prog->getType() == Program::TYPE_GEOMETRY);
      Value *addr = i->getIndirect(0, 1);

      if (src.isIndirect(0)) {
         // base address is in an address register, so move to a GPR
         Value *base = bld.getScratch();
         bld.mkMov(base, addr);

         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                    i->getIndirect(0, 0), bld.mkImm(2));

         // Calculate final address: addr = base + attr*vstride; use 16-bit
         // multiplication since 32-bit would be lowered to multiple
         // instructions, and we only need the low 16 bits of the result
         Value *a[2], *b[2];
         bld.mkSplit(a, 2, attrib);
         bld.mkSplit(b, 2, vstride);
         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
                                 base);

         // move address from GPR into an address register
         addr = bld.getSSA(2, FILE_ADDRESS);
         bld.mkMov(addr, sum);
      }

      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, addr);
   }

   return true;
}

bool
NV50LoweringPreSSA::handlePFETCH(Instruction *i)
{
   assert(prog->getType() == Program::TYPE_GEOMETRY);

   // NOTE: cannot use getImmediate here, not in SSA form yet, move to
   // later phase if that assertion ever triggers:

   ImmediateValue *imm = i->getSrc(0)->asImm();
   assert(imm);

   assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens

   if (i->srcExists(1)) {
      // indirect addressing of vertex in primitive space

      LValue *val = bld.getScratch();
      Value *ptr = bld.getSSA(2, FILE_ADDRESS);
      bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
      bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);

      // NOTE: PFETCH directly to an $aX only works with direct addressing
      i->op = OP_SHL;
      i->setSrc(0, val);
      i->setSrc(1, bld.mkImm(0));
   }

   return true;
}

// Set flags according to predicate and make the instruction read $cX.
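//
// (Predicates living in a GPR are compared against zero with a "set neu"
// into a FILE_FLAGS value, and the instruction's original condition code is
// re-attached to that $c register.)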
void
NV50LoweringPreSSA::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *cdst;

   // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
   if (!pred ||
       pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
      return;

   cdst = bld.getSSA(1, FILE_FLAGS);

   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType,
             bld.loadImm(NULL, 0), pred);

   insn->setPredicate(insn->cc, cdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NV50LoweringPreSSA::visit(Instruction *i)
{
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXB:
      return handleTXB(i->asTex());
   case OP_TXL:
      return handleTXL(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_SET:
      return handleSET(i);
   case OP_SLCT:
      return handleSLCT(i->asCmp());
   case OP_SELP:
      return handleSELP(i);
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_LOAD:
      return handleLOAD(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_CALL:
      return handleCALL(i);
   case OP_PRECONT:
      return handlePRECONT(i);
   case OP_CONT:
      return handleCONT(i);
   case OP_PFETCH:
      return handlePFETCH(i);
   default:
      break;
   }
   return true;
}

bool
TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
{
   bool ret = false;

   if (stage == CG_STAGE_PRE_SSA) {
      NV50LoweringPreSSA pass(prog);
      ret = pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      if (!prog->targetPriv)
         prog->targetPriv = new std::list<Instruction *>();
      NV50LegalizeSSA pass(prog);
      ret = pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NV50LegalizePostRA pass;
      ret = pass.run(prog, false, true);
      if (prog->targetPriv)
         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   }
   return ret;
}

} // namespace nv50_ir