1 /* 2 * Copyright 2011 Christoph Bumiller 3 * 2014 Red Hat Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 24 #include "codegen/nv50_ir.h" 25 #include "codegen/nv50_ir_build_util.h" 26 27 #include "codegen/nv50_ir_target_nvc0.h" 28 #include "codegen/nv50_ir_lowering_gm107.h" 29 30 #include <limits> 31 32 namespace nv50_ir { 33 34 #define QOP_ADD 0 35 #define QOP_SUBR 1 36 #define QOP_SUB 2 37 #define QOP_MOV2 3 38 39 // UL UR LL LR 40 #define QUADOP(q, r, s, t) \ 41 ((QOP_##q << 6) | (QOP_##r << 4) | \ 42 (QOP_##s << 2) | (QOP_##t << 0)) 43 44 #define SHFL_BOUND_QUAD 0x1c03 45 46 void 47 GM107LegalizeSSA::handlePFETCH(Instruction *i) 48 { 49 Value *src0; 50 51 if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1)) 52 return; 53 54 bld.setPosition(i, false); 55 src0 = bld.getSSA(); 56 57 if (i->srcExists(1)) 58 bld.mkOp2(OP_ADD , TYPE_U32, src0, i->getSrc(0), i->getSrc(1)); 59 else 60 bld.mkOp1(OP_MOV , TYPE_U32, src0, i->getSrc(0)); 61 62 i->setSrc(0, src0); 63 i->setSrc(1, NULL); 64 } 65 66 void 67 GM107LegalizeSSA::handleLOAD(Instruction *i) 68 { 69 if (i->src(0).getFile() != FILE_MEMORY_CONST) 70 return; 71 if (i->src(0).isIndirect(0)) 72 return; 73 if (typeSizeof(i->dType) != 4) 74 return; 75 76 i->op = OP_MOV; 77 } 78 79 bool 80 GM107LegalizeSSA::visit(Instruction *i) 81 { 82 switch (i->op) { 83 case OP_PFETCH: 84 handlePFETCH(i); 85 break; 86 case OP_LOAD: 87 handleLOAD(i); 88 break; 89 default: 90 break; 91 } 92 return true; 93 } 94 95 bool 96 GM107LoweringPass::handleManualTXD(TexInstruction *i) 97 { 98 // See NVC0LoweringPass::handleManualTXD for rationale. This function 99 // implements the same logic, but using SM50-friendly primitives. 100 static const uint8_t qOps[2] = 101 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }; 102 Value *def[4][4]; 103 Value *crd[3], *arr, *shadow; 104 Value *tmp; 105 Instruction *tex, *add; 106 Value *quad = bld.mkImm(SHFL_BOUND_QUAD); 107 int l, c; 108 const int dim = i->tex.target.getDim() + i->tex.target.isCube(); 109 const int array = i->tex.target.isArray(); 110 const int indirect = i->tex.rIndirectSrc >= 0; 111 112 i->op = OP_TEX; // no need to clone dPdx/dPdy later 113 114 for (c = 0; c < dim; ++c) 115 crd[c] = bld.getScratch(); 116 arr = bld.getScratch(); 117 shadow = bld.getScratch(); 118 tmp = bld.getScratch(); 119 120 for (l = 0; l < 4; ++l) { 121 Value *src[3], *val; 122 Value *lane = bld.mkImm(l); 123 bld.mkOp(OP_QUADON, TYPE_NONE, NULL); 124 // Make sure lane 0 has the appropriate array/depth compare values 125 if (l != 0) { 126 if (array) 127 bld.mkOp3(OP_SHFL, TYPE_F32, arr, i->getSrc(0), lane, quad); 128 if (i->tex.target.isShadow()) 129 bld.mkOp3(OP_SHFL, TYPE_F32, shadow, i->getSrc(array + dim + indirect), lane, quad); 130 } 131 132 // mov coordinates from lane l to all lanes 133 for (c = 0; c < dim; ++c) { 134 bld.mkOp3(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), lane, quad); 135 } 136 137 // add dPdx from lane l to lanes dx 138 for (c = 0; c < dim; ++c) { 139 bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), lane, quad); 140 add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); 141 add->subOp = qOps[0]; 142 add->lanes = 1; /* abused for .ndv */ 143 } 144 145 // add dPdy from lane l to lanes dy 146 for (c = 0; c < dim; ++c) { 147 bld.mkOp3(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), lane, quad); 148 add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]); 149 add->subOp = qOps[1]; 150 add->lanes = 1; /* abused for .ndv */ 151 } 152 153 // normalize cube coordinates if necessary 154 if (i->tex.target.isCube()) { 155 for (c = 0; c < 3; ++c) 156 src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); 157 val = bld.getScratch(); 158 bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); 159 bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); 160 bld.mkOp1(OP_RCP, TYPE_F32, val, val); 161 for (c = 0; c < 3; ++c) 162 src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); 163 } else { 164 for (c = 0; c < dim; ++c) 165 src[c] = crd[c]; 166 } 167 168 // texture 169 bld.insert(tex = cloneForward(func, i)); 170 if (l != 0) { 171 if (array) 172 tex->setSrc(0, arr); 173 if (i->tex.target.isShadow()) 174 tex->setSrc(array + dim + indirect, shadow); 175 } 176 for (c = 0; c < dim; ++c) 177 tex->setSrc(c + array, src[c]); 178 // broadcast results from lane 0 to all lanes 179 if (l != 0) 180 for (c = 0; i->defExists(c); ++c) 181 bld.mkOp3(OP_SHFL, TYPE_F32, tex->getDef(c), tex->getDef(c), bld.mkImm(0), quad); 182 bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); 183 184 // save results 185 for (c = 0; i->defExists(c); ++c) { 186 Instruction *mov; 187 def[c][l] = bld.getSSA(); 188 mov = bld.mkMov(def[c][l], tex->getDef(c)); 189 mov->fixed = 1; 190 mov->lanes = 1 << l; 191 } 192 } 193 194 for (c = 0; i->defExists(c); ++c) { 195 Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); 196 for (l = 0; l < 4; ++l) 197 u->setSrc(l, def[c][l]); 198 } 199 200 i->bb->remove(i); 201 return true; 202 } 203 204 bool 205 GM107LoweringPass::handleDFDX(Instruction *insn) 206 { 207 Instruction *shfl; 208 int qop = 0, xid = 0; 209 210 switch (insn->op) { 211 case OP_DFDX: 212 qop = QUADOP(SUB, SUBR, SUB, SUBR); 213 xid = 1; 214 break; 215 case OP_DFDY: 216 qop = QUADOP(SUB, SUB, SUBR, SUBR); 217 xid = 2; 218 break; 219 default: 220 assert(!"invalid dfdx opcode"); 221 break; 222 } 223 224 shfl = bld.mkOp3(OP_SHFL, TYPE_F32, bld.getScratch(), insn->getSrc(0), 225 bld.mkImm(xid), bld.mkImm(SHFL_BOUND_QUAD)); 226 shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY; 227 insn->op = OP_QUADOP; 228 insn->subOp = qop; 229 insn->lanes = 0; /* abused for !.ndv */ 230 insn->setSrc(1, insn->getSrc(0)); 231 insn->setSrc(0, shfl->getDef(0)); 232 return true; 233 } 234 235 bool 236 GM107LoweringPass::handlePFETCH(Instruction *i) 237 { 238 Value *tmp0 = bld.getScratch(); 239 Value *tmp1 = bld.getScratch(); 240 Value *tmp2 = bld.getScratch(); 241 bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0)); 242 bld.mkOp2(OP_SHR , TYPE_U32, tmp1, tmp0, bld.mkImm(16)); 243 bld.mkOp2(OP_AND , TYPE_U32, tmp0, tmp0, bld.mkImm(0xff)); 244 bld.mkOp2(OP_AND , TYPE_U32, tmp1, tmp1, bld.mkImm(0xff)); 245 if (i->getSrc(1)) 246 bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1)); 247 else 248 bld.mkOp1(OP_MOV , TYPE_U32, tmp2, i->getSrc(0)); 249 bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2); 250 i->setSrc(0, tmp0); 251 i->setSrc(1, NULL); 252 return true; 253 } 254 255 bool 256 GM107LoweringPass::handlePOPCNT(Instruction *i) 257 { 258 Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(), 259 i->getSrc(0), i->getSrc(1)); 260 i->setSrc(0, tmp); 261 i->setSrc(1, NULL); 262 return true; 263 } 264 265 // 266 // - add quadop dance for texturing 267 // - put FP outputs in GPRs 268 // - convert instruction sequences 269 // 270 bool 271 GM107LoweringPass::visit(Instruction *i) 272 { 273 bld.setPosition(i, false); 274 275 if (i->cc != CC_ALWAYS) 276 checkPredicate(i); 277 278 switch (i->op) { 279 case OP_PFETCH: 280 return handlePFETCH(i); 281 case OP_DFDX: 282 case OP_DFDY: 283 return handleDFDX(i); 284 case OP_POPCNT: 285 return handlePOPCNT(i); 286 default: 287 return NVC0LoweringPass::visit(i); 288 } 289 } 290 291 } // namespace nv50_ir 292