1 /* 2 * Copyright 2010 Christoph Bumiller 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "nv50/nv50_program.h" 24 #include "nv50/nv50_context.h" 25 26 #include "codegen/nv50_ir_driver.h" 27 28 static inline unsigned 29 bitcount4(const uint32_t val) 30 { 31 static const uint8_t cnt[16] 32 = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 33 return cnt[val & 0xf]; 34 } 35 36 static int 37 nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info) 38 { 39 struct nv50_program *prog = (struct nv50_program *)info->driverPriv; 40 unsigned i, n, c; 41 42 n = 0; 43 for (i = 0; i < info->numInputs; ++i) { 44 prog->in[i].id = i; 45 prog->in[i].sn = info->in[i].sn; 46 prog->in[i].si = info->in[i].si; 47 prog->in[i].hw = n; 48 prog->in[i].mask = info->in[i].mask; 49 50 prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32); 51 52 for (c = 0; c < 4; ++c) 53 if (info->in[i].mask & (1 << c)) 54 info->in[i].slot[c] = n++; 55 56 if (info->in[i].sn == TGSI_SEMANTIC_PRIMID) 57 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID; 58 } 59 prog->in_nr = info->numInputs; 60 61 for (i = 0; i < info->numSysVals; ++i) { 62 switch (info->sv[i].sn) { 63 case TGSI_SEMANTIC_INSTANCEID: 64 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID; 65 continue; 66 case TGSI_SEMANTIC_VERTEXID: 67 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID; 68 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START; 69 continue; 70 default: 71 break; 72 } 73 } 74 75 /* 76 * Corner case: VP has no inputs, but we will still need to submit data to 77 * draw it. HW will shout at us and won't draw anything if we don't enable 78 * any input, so let's just pretend it's the first one. 79 */ 80 if (prog->vp.attrs[0] == 0 && 81 prog->vp.attrs[1] == 0 && 82 prog->vp.attrs[2] == 0) 83 prog->vp.attrs[0] |= 0xf; 84 85 /* VertexID before InstanceID */ 86 if (info->io.vertexId < info->numSysVals) 87 info->sv[info->io.vertexId].slot[0] = n++; 88 if (info->io.instanceId < info->numSysVals) 89 info->sv[info->io.instanceId].slot[0] = n++; 90 91 n = 0; 92 for (i = 0; i < info->numOutputs; ++i) { 93 switch (info->out[i].sn) { 94 case TGSI_SEMANTIC_PSIZE: 95 prog->vp.psiz = i; 96 break; 97 case TGSI_SEMANTIC_CLIPDIST: 98 prog->vp.clpd[info->out[i].si] = n; 99 break; 100 case TGSI_SEMANTIC_EDGEFLAG: 101 prog->vp.edgeflag = i; 102 break; 103 case TGSI_SEMANTIC_BCOLOR: 104 prog->vp.bfc[info->out[i].si] = i; 105 break; 106 case TGSI_SEMANTIC_LAYER: 107 prog->gp.has_layer = true; 108 prog->gp.layerid = n; 109 break; 110 case TGSI_SEMANTIC_VIEWPORT_INDEX: 111 prog->gp.has_viewport = true; 112 prog->gp.viewportid = n; 113 break; 114 default: 115 break; 116 } 117 prog->out[i].id = i; 118 prog->out[i].sn = info->out[i].sn; 119 prog->out[i].si = info->out[i].si; 120 prog->out[i].hw = n; 121 prog->out[i].mask = info->out[i].mask; 122 123 for (c = 0; c < 4; ++c) 124 if (info->out[i].mask & (1 << c)) 125 info->out[i].slot[c] = n++; 126 } 127 prog->out_nr = info->numOutputs; 128 prog->max_out = n; 129 if (!prog->max_out) 130 prog->max_out = 1; 131 132 if (prog->vp.psiz < info->numOutputs) 133 prog->vp.psiz = prog->out[prog->vp.psiz].hw; 134 135 return 0; 136 } 137 138 static int 139 nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info) 140 { 141 struct nv50_program *prog = (struct nv50_program *)info->driverPriv; 142 unsigned i, n, m, c; 143 unsigned nvary; 144 unsigned nflat; 145 unsigned nintp = 0; 146 147 /* count recorded non-flat inputs */ 148 for (m = 0, i = 0; i < info->numInputs; ++i) { 149 switch (info->in[i].sn) { 150 case TGSI_SEMANTIC_POSITION: 151 continue; 152 default: 153 m += info->in[i].flat ? 0 : 1; 154 break; 155 } 156 } 157 /* careful: id may be != i in info->in[prog->in[i].id] */ 158 159 /* Fill prog->in[] so that non-flat inputs are first and 160 * kick out special inputs that don't use the RESULT_MAP. 161 */ 162 for (n = 0, i = 0; i < info->numInputs; ++i) { 163 if (info->in[i].sn == TGSI_SEMANTIC_POSITION) { 164 prog->fp.interp |= info->in[i].mask << 24; 165 for (c = 0; c < 4; ++c) 166 if (info->in[i].mask & (1 << c)) 167 info->in[i].slot[c] = nintp++; 168 } else { 169 unsigned j = info->in[i].flat ? m++ : n++; 170 171 if (info->in[i].sn == TGSI_SEMANTIC_COLOR) 172 prog->vp.bfc[info->in[i].si] = j; 173 else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID) 174 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID; 175 176 prog->in[j].id = i; 177 prog->in[j].mask = info->in[i].mask; 178 prog->in[j].sn = info->in[i].sn; 179 prog->in[j].si = info->in[i].si; 180 prog->in[j].linear = info->in[i].linear; 181 182 prog->in_nr++; 183 } 184 } 185 if (!(prog->fp.interp & (8 << 24))) { 186 ++nintp; 187 prog->fp.interp |= 8 << 24; 188 } 189 190 for (i = 0; i < prog->in_nr; ++i) { 191 int j = prog->in[i].id; 192 193 prog->in[i].hw = nintp; 194 for (c = 0; c < 4; ++c) 195 if (prog->in[i].mask & (1 << c)) 196 info->in[j].slot[c] = nintp++; 197 } 198 /* (n == m) if m never increased, i.e. no flat inputs */ 199 nflat = (n < m) ? (nintp - prog->in[n].hw) : 0; 200 nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */ 201 nvary = nintp - nflat; 202 203 prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT; 204 prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT; 205 206 /* put front/back colors right after HPOS */ 207 prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT; 208 for (i = 0; i < 2; ++i) 209 if (prog->vp.bfc[i] < 0xff) 210 prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16; 211 212 /* FP outputs */ 213 214 if (info->prop.fp.numColourResults > 1) 215 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS; 216 217 for (i = 0; i < info->numOutputs; ++i) { 218 prog->out[i].id = i; 219 prog->out[i].sn = info->out[i].sn; 220 prog->out[i].si = info->out[i].si; 221 prog->out[i].mask = info->out[i].mask; 222 223 if (i == info->io.fragDepth || i == info->io.sampleMask) 224 continue; 225 prog->out[i].hw = info->out[i].si * 4; 226 227 for (c = 0; c < 4; ++c) 228 info->out[i].slot[c] = prog->out[i].hw + c; 229 230 prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4); 231 } 232 233 if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) { 234 info->out[info->io.sampleMask].slot[0] = prog->max_out++; 235 prog->fp.has_samplemask = 1; 236 } 237 238 if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS) 239 info->out[info->io.fragDepth].slot[2] = prog->max_out++; 240 241 if (!prog->max_out) 242 prog->max_out = 4; 243 244 return 0; 245 } 246 247 static int 248 nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info) 249 { 250 switch (info->type) { 251 case PIPE_SHADER_VERTEX: 252 return nv50_vertprog_assign_slots(info); 253 case PIPE_SHADER_GEOMETRY: 254 return nv50_vertprog_assign_slots(info); 255 case PIPE_SHADER_FRAGMENT: 256 return nv50_fragprog_assign_slots(info); 257 case PIPE_SHADER_COMPUTE: 258 return 0; 259 default: 260 return -1; 261 } 262 } 263 264 static struct nv50_stream_output_state * 265 nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info, 266 const struct pipe_stream_output_info *pso) 267 { 268 struct nv50_stream_output_state *so; 269 unsigned b, i, c; 270 unsigned base[4]; 271 272 so = MALLOC_STRUCT(nv50_stream_output_state); 273 if (!so) 274 return NULL; 275 memset(so->map, 0xff, sizeof(so->map)); 276 277 for (b = 0; b < 4; ++b) 278 so->num_attribs[b] = 0; 279 for (i = 0; i < pso->num_outputs; ++i) { 280 unsigned end = pso->output[i].dst_offset + pso->output[i].num_components; 281 b = pso->output[i].output_buffer; 282 assert(b < 4); 283 so->num_attribs[b] = MAX2(so->num_attribs[b], end); 284 } 285 286 so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED; 287 288 so->stride[0] = pso->stride[0] * 4; 289 base[0] = 0; 290 for (b = 1; b < 4; ++b) { 291 assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]); 292 so->stride[b] = so->num_attribs[b] * 4; 293 if (so->num_attribs[b]) 294 so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT; 295 base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4); 296 } 297 if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) { 298 assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX); 299 so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT; 300 } 301 302 so->map_size = base[3] + so->num_attribs[3]; 303 304 for (i = 0; i < pso->num_outputs; ++i) { 305 const unsigned s = pso->output[i].start_component; 306 const unsigned p = pso->output[i].dst_offset; 307 const unsigned r = pso->output[i].register_index; 308 b = pso->output[i].output_buffer; 309 310 if (r >= info->numOutputs) 311 continue; 312 313 for (c = 0; c < pso->output[i].num_components; ++c) 314 so->map[base[b] + p + c] = info->out[r].slot[s + c]; 315 } 316 317 return so; 318 } 319 320 bool 321 nv50_program_translate(struct nv50_program *prog, uint16_t chipset, 322 struct pipe_debug_callback *debug) 323 { 324 struct nv50_ir_prog_info *info; 325 int i, ret; 326 const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80; 327 328 info = CALLOC_STRUCT(nv50_ir_prog_info); 329 if (!info) 330 return false; 331 332 info->type = prog->type; 333 info->target = chipset; 334 info->bin.sourceRep = NV50_PROGRAM_IR_TGSI; 335 info->bin.source = (void *)prog->pipe.tokens; 336 337 info->io.auxCBSlot = 15; 338 info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET; 339 info->io.genUserClip = prog->vp.clpd_nr; 340 if (prog->fp.alphatest) 341 info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET; 342 343 info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET; 344 info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET; 345 info->io.msInfoCBSlot = 15; 346 info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET; 347 348 info->assignSlots = nv50_program_assign_varying_slots; 349 350 prog->vp.bfc[0] = 0xff; 351 prog->vp.bfc[1] = 0xff; 352 prog->vp.edgeflag = 0xff; 353 prog->vp.clpd[0] = map_undef; 354 prog->vp.clpd[1] = map_undef; 355 prog->vp.psiz = map_undef; 356 prog->gp.has_layer = 0; 357 prog->gp.has_viewport = 0; 358 359 if (prog->type == PIPE_SHADER_COMPUTE) 360 info->prop.cp.inputOffset = 0x10; 361 362 info->driverPriv = prog; 363 364 #ifdef DEBUG 365 info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3); 366 info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0); 367 #else 368 info->optLevel = 3; 369 #endif 370 371 ret = nv50_ir_generate_code(info); 372 if (ret) { 373 NOUVEAU_ERR("shader translation failed: %i\n", ret); 374 goto out; 375 } 376 377 prog->code = info->bin.code; 378 prog->code_size = info->bin.codeSize; 379 prog->fixups = info->bin.relocData; 380 prog->interps = info->bin.fixupData; 381 prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1); 382 prog->tls_space = info->bin.tlsSpace; 383 384 prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS; 385 386 prog->vp.clip_enable = (1 << info->io.clipDistances) - 1; 387 prog->vp.cull_enable = 388 ((1 << info->io.cullDistances) - 1) << info->io.clipDistances; 389 prog->vp.clip_mode = 0; 390 for (i = 0; i < info->io.cullDistances; ++i) 391 prog->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4); 392 393 if (prog->type == PIPE_SHADER_FRAGMENT) { 394 if (info->prop.fp.writesDepth) { 395 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z; 396 prog->fp.flags[1] = 0x11; 397 } 398 if (info->prop.fp.usesDiscard) 399 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL; 400 } else 401 if (prog->type == PIPE_SHADER_GEOMETRY) { 402 switch (info->prop.gp.outputPrim) { 403 case PIPE_PRIM_LINE_STRIP: 404 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP; 405 break; 406 case PIPE_PRIM_TRIANGLE_STRIP: 407 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP; 408 break; 409 case PIPE_PRIM_POINTS: 410 default: 411 assert(info->prop.gp.outputPrim == PIPE_PRIM_POINTS); 412 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS; 413 break; 414 } 415 prog->gp.vert_count = CLAMP(info->prop.gp.maxVertices, 1, 1024); 416 } 417 418 if (prog->type == PIPE_SHADER_COMPUTE) { 419 prog->cp.syms = info->bin.syms; 420 prog->cp.num_syms = info->bin.numSyms; 421 } else { 422 FREE(info->bin.syms); 423 } 424 425 if (prog->pipe.stream_output.num_outputs) 426 prog->so = nv50_program_create_strmout_state(info, 427 &prog->pipe.stream_output); 428 429 pipe_debug_message(debug, SHADER_INFO, 430 "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d", 431 prog->type, info->bin.tlsSpace, prog->max_gpr, 432 info->bin.instructions, info->bin.codeSize); 433 434 out: 435 FREE(info); 436 return !ret; 437 } 438 439 bool 440 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) 441 { 442 struct nouveau_heap *heap; 443 int ret; 444 uint32_t size = align(prog->code_size, 0x40); 445 uint8_t prog_type; 446 447 switch (prog->type) { 448 case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break; 449 case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break; 450 case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break; 451 case PIPE_SHADER_COMPUTE: heap = nv50->screen->fp_code_heap; break; 452 default: 453 assert(!"invalid program type"); 454 return false; 455 } 456 457 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); 458 if (ret) { 459 /* Out of space: evict everything to compactify the code segment, hoping 460 * the working set is much smaller and drifts slowly. Improve me ! 461 */ 462 while (heap->next) { 463 struct nv50_program *evict = heap->next->priv; 464 if (evict) 465 nouveau_heap_free(&evict->mem); 466 } 467 debug_printf("WARNING: out of code space, evicting all shaders.\n"); 468 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); 469 if (ret) { 470 NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); 471 return false; 472 } 473 } 474 475 if (prog->type == PIPE_SHADER_COMPUTE) { 476 /* CP code must be uploaded in FP code segment. */ 477 prog_type = 1; 478 } else { 479 prog->code_base = prog->mem->start; 480 prog_type = prog->type; 481 } 482 483 ret = nv50_tls_realloc(nv50->screen, prog->tls_space); 484 if (ret < 0) { 485 nouveau_heap_free(&prog->mem); 486 return false; 487 } 488 if (ret > 0) 489 nv50->state.new_tls_space = true; 490 491 if (prog->fixups) 492 nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0); 493 if (prog->interps) 494 nv50_ir_apply_fixups(prog->interps, prog->code, 495 prog->fp.force_persample_interp, 496 false /* flatshade */, 497 prog->fp.alphatest - 1); 498 499 nv50_sifc_linear_u8(&nv50->base, nv50->screen->code, 500 (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base, 501 NOUVEAU_BO_VRAM, prog->code_size, prog->code); 502 503 BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1); 504 PUSH_DATA (nv50->base.pushbuf, 0); 505 506 return true; 507 } 508 509 void 510 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 511 { 512 const struct pipe_shader_state pipe = p->pipe; 513 const ubyte type = p->type; 514 515 if (p->mem) 516 nouveau_heap_free(&p->mem); 517 518 FREE(p->code); 519 520 FREE(p->fixups); 521 FREE(p->interps); 522 FREE(p->so); 523 524 if (type == PIPE_SHADER_COMPUTE) 525 FREE(p->cp.syms); 526 527 memset(p, 0, sizeof(*p)); 528 529 p->pipe = pipe; 530 p->type = type; 531 } 532