/*
 * Copyright 2010 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "pipe/p_defines.h"

#include "tgsi/tgsi_ureg.h"

#include "nvc0/nvc0_context.h"

#include "codegen/nv50_ir_driver.h"
#include "nvc0/nve4_compute.h"

/* NOTE: Using a[0x270] in FP may cause an error even if we're using less than
 * 124 scalar varying values.
 */

/* Map a TGSI input semantic (name sn, index si) to its fixed byte address in
 * the shader's attribute input space. Returns ~0 on an unknown semantic.
 */
static uint32_t
nvc0_shader_input_address(unsigned sn, unsigned si)
{
   switch (sn) {
   case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
   case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
   case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
   case TGSI_SEMANTIC_PRIMID:       return 0x060;
   case TGSI_SEMANTIC_LAYER:        return 0x064;
   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   case TGSI_SEMANTIC_PSIZE:        return 0x06c;
   case TGSI_SEMANTIC_POSITION:     return 0x070;
   case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
   case TGSI_SEMANTIC_FOG:          return 0x2e8;
   case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
   case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
   case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
   case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
   case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
   case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
   case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
   default:
      assert(!"invalid TGSI input semantic");
      return ~0;
   }
}

/* Map a TGSI output semantic to its byte address in the shader's output
 * space. Same layout as the input map, minus the input-only system values;
 * EDGEFLAG has no address (~0) since it is handled separately.
 */
static uint32_t
nvc0_shader_output_address(unsigned sn, unsigned si)
{
   switch (sn) {
   case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
   case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
   case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
   case TGSI_SEMANTIC_PRIMID:       return 0x060;
   case TGSI_SEMANTIC_LAYER:        return 0x064;
   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   case TGSI_SEMANTIC_PSIZE:        return 0x06c;
   case TGSI_SEMANTIC_POSITION:     return 0x070;
   case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
   case TGSI_SEMANTIC_FOG:          return 0x2e8;
   case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
   case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
   case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
   /* case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0; */
   case TGSI_SEMANTIC_EDGEFLAG:     return ~0;
   default:
      assert(!"invalid TGSI output semantic");
      return ~0;
   }
}

/* Assign input slots for a vertex shader: regular attributes get packed
 * consecutively into the generic area starting at 0x80, while INSTANCEID /
 * VERTEXID keep their fixed addresses. Slots are in 32-bit word units.
 */
static int
nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
{
   unsigned i, c, n;

   for (n = 0, i = 0; i < info->numInputs; ++i) {
      switch (info->in[i].sn) {
      case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
      case TGSI_SEMANTIC_VERTEXID:
         info->in[i].mask = 0x1;
         info->in[i].slot[0] =
            nvc0_shader_input_address(info->in[i].sn, 0) / 4;
         continue;
      default:
         break;
      }
      for (c = 0; c < 4; ++c)
         info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
      ++n;
   }

   return 0;
}

/* Assign input slots for non-VP stages purely by semantic address. */
static int
nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
{
   unsigned offset;
   unsigned i, c;

   for (i = 0; i < info->numInputs; ++i) {
      offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);

      for (c = 0; c < 4; ++c)
         info->in[i].slot[c] = (offset + c * 0x4) / 4;
   }

   return 0;
}

/* Assign fragment shader output slots: colour results come first (4 slots
 * per render target), then the sample mask, then depth in slot[2].
 */
static int
nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
{
   unsigned count = info->prop.fp.numColourResults * 4;
   unsigned i, c;

   for (i = 0; i < info->numOutputs; ++i)
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         for (c = 0; c < 4; ++c)
            info->out[i].slot[c] = info->out[i].si * 4 + c;

   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
      info->out[info->io.sampleMask].slot[0] = count++;
   else
   if (info->target >= 0xe0)
      count++; /* on Kepler, depth is always last colour reg + 2 */

   if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
      info->out[info->io.fragDepth].slot[2] = count;

   return 0;
}

/* Assign output slots for non-FP stages purely by semantic address. */
static int
nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
{
   unsigned offset;
   unsigned i, c;

   for (i = 0; i < info->numOutputs; ++i) {
      offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);

      for (c = 0; c < 4; ++c)
         info->out[i].slot[c] = (offset + c * 0x4) / 4;
   }

   return 0;
}

/* Callback installed as info->assignSlots: dispatch to the per-stage
 * slot assignment routines above.
 */
static int
nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info)
{
   int ret;

   if (info->type == PIPE_SHADER_VERTEX)
      ret = nvc0_vp_assign_input_slots(info);
   else
      ret = nvc0_sp_assign_input_slots(info);
   if (ret)
      return ret;

   if (info->type == PIPE_SHADER_FRAGMENT)
      ret = nvc0_fp_assign_output_slots(info);
   else
      ret = nvc0_sp_assign_output_slots(info);
   return ret;
}

/* Widen the min/max "output read" range stored in hdr[4] (min in bits
 * 12..19, max in bits 24..31) to include the given slot.
 */
static inline void
nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)
{
   uint8_t min = (vp->hdr[4] >> 12) & 0xff;
   uint8_t max = (vp->hdr[4] >> 24);

   min = MIN2(min, slot);
   max = MAX2(max, slot);

   vp->hdr[4] = (max << 24) | (min << 12);
}

/* Common part of header generation for VP, TCP, TEP and GP. */
static int
nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
{
   unsigned i, c, a;

   /* Per-vertex input read mask: one bit per 32-bit slot, stored from
    * hdr[5] onwards. Patch inputs are not part of this map.
    */
   for (i = 0; i < info->numInputs; ++i) {
      if (info->in[i].patch)
         continue;
      for (c = 0; c < 4; ++c) {
         a = info->in[i].slot[c];
         if (info->in[i].mask & (1 << c))
            vp->hdr[5 + a / 32] |= 1 << (a % 32);
      }
   }

   /* Per-vertex output write mask from hdr[13] onwards; slots below
    * 0x40/4 are reserved, hence the bias.
    */
   for (i = 0; i < info->numOutputs; ++i) {
      if (info->out[i].patch)
         continue;
      for (c = 0; c < 4; ++c) {
         if (!(info->out[i].mask & (1 << c)))
            continue;
         assert(info->out[i].slot[c] >= 0x40 / 4);
         a = info->out[i].slot[c] - 0x40 / 4;
         vp->hdr[13 + a / 32] |= 1 << (a % 32);
         if (info->out[i].oread)
            nvc0_vtgp_hdr_update_oread(vp, info->out[i].slot[c]);
      }
   }

   for (i = 0; i < info->numSysVals; ++i) {
      switch (info->sv[i].sn) {
      case TGSI_SEMANTIC_PRIMID:
         vp->hdr[5] |= 1 << 24;
         break;
      case TGSI_SEMANTIC_INSTANCEID:
         vp->hdr[10] |= 1 << 30;
         break;
      case TGSI_SEMANTIC_VERTEXID:
         vp->hdr[10] |= 1 << 31;
         break;
      case TGSI_SEMANTIC_TESSCOORD:
         /* We don't have the mask, nor the slots populated. While this could
          * be achieved, the vast majority of the time if either of the coords
          * are read, then both will be read.
          */
         nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4);
         nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4);
         break;
      default:
         break;
      }
   }

   vp->vp.clip_enable = (1 << info->io.clipDistances) - 1;
   vp->vp.cull_enable =
      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
   for (i = 0; i < info->io.cullDistances; ++i)
      vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);

   if (info->io.genUserClip < 0)
      vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */

   return 0;
}

/* Build the program header for a vertex shader. */
static int
nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
{
   vp->hdr[0] = 0x20061 | (1 << 10);
   vp->hdr[4] = 0xff000; /* initial min/max output read range (empty) */

   return nvc0_vtgp_gen_header(vp, info);
}

/* Derive the TESS_MODE method value (domain, connectivity, winding,
 * spacing) from the tessellation properties; ~0 means "not set".
 */
static void
nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
{
   if (info->prop.tp.outputPrim == PIPE_PRIM_MAX) {
      tp->tp.tess_mode = ~0;
      return;
   }
   switch (info->prop.tp.domain) {
   case PIPE_PRIM_LINES:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_ISOLINES;
      break;
   case PIPE_PRIM_TRIANGLES:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
      break;
   case PIPE_PRIM_QUADS:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
      break;
   default:
      tp->tp.tess_mode = ~0;
      return;
   }

   /* It seems like lines want the "CW" bit to indicate they're connected, and
    * spit out errors in dmesg when the "CONNECTED" bit is set.
    */
   if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) {
      if (info->prop.tp.domain == PIPE_PRIM_LINES)
         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
      else
         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
   }

   /* Winding only matters for triangles/quads, not lines. */
   if (info->prop.tp.domain != PIPE_PRIM_LINES &&
       info->prop.tp.outputPrim != PIPE_PRIM_POINTS &&
       info->prop.tp.winding > 0)
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;

   switch (info->prop.tp.partitioning) {
   case PIPE_TESS_SPACING_EQUAL:
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;
      break;
   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;
      break;
   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;
      break;
   default:
      assert(!"invalid tessellator partitioning");
      break;
   }
}

/* Build the program header for a tessellation control shader. */
static int
nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
{
   unsigned opcs = 6; /* output patch constants (at least the TessFactors) */

   tcp->tp.input_patch_size = info->prop.tp.inputPatchSize;

   if (info->numPatchConstants)
      opcs = 8 + info->numPatchConstants * 4;

   tcp->hdr[0] = 0x20061 | (2 << 10);

   tcp->hdr[1] = opcs << 24;
   tcp->hdr[2] = info->prop.tp.outputPatchSize << 24;

   tcp->hdr[4] = 0xff000; /* initial min/max parallel output read address */

   nvc0_vtgp_gen_header(tcp, info);

   if (info->target >= NVISA_GM107_CHIPSET) {
      /* On GM107+, the number of output patch components has moved in the TCP
       * header, but it seems like blob still also uses the old position.
       * Also, the high 8-bits are located inbetween the min/max parallel
       * field and has to be set after updating the outputs.
       */
      tcp->hdr[3] = (opcs & 0x0f) << 28;
      tcp->hdr[4] |= (opcs & 0xf0) << 16;
   }

   nvc0_tp_get_tess_mode(tcp, info);

   return 0;
}

/* Build the program header for a tessellation evaluation shader. */
static int
nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
{
   tep->tp.input_patch_size = ~0;

   tep->hdr[0] = 0x20061 | (3 << 10);
   tep->hdr[4] = 0xff000;

   nvc0_vtgp_gen_header(tep, info);

   nvc0_tp_get_tess_mode(tep, info);

   tep->hdr[18] |= 0x3 << 12; /* ? */

   return 0;
}

/* Build the program header for a geometry shader (output primitive type,
 * instance count, max vertex count).
 */
static int
nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
{
   gp->hdr[0] = 0x20061 | (4 << 10);

   gp->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;

   switch (info->prop.gp.outputPrim) {
   case PIPE_PRIM_POINTS:
      gp->hdr[3] = 0x01000000;
      gp->hdr[0] |= 0xf0000000;
      break;
   case PIPE_PRIM_LINE_STRIP:
      gp->hdr[3] = 0x06000000;
      gp->hdr[0] |= 0x10000000;
      break;
   case PIPE_PRIM_TRIANGLE_STRIP:
      gp->hdr[3] = 0x07000000;
      gp->hdr[0] |= 0x10000000;
      break;
   default:
      assert(0);
      break;
   }

   gp->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);

   return nvc0_vtgp_gen_header(gp, info);
}

/* 2-bit interpolation mode encodings used in the FP header, plus the
 * centroid flag used in fp.color_interp.
 */
#define NVC0_INTERP_FLAT          (1 << 0)
#define NVC0_INTERP_PERSPECTIVE   (2 << 0)
#define NVC0_INTERP_LINEAR        (3 << 0)
#define NVC0_INTERP_CENTROID      (1 << 2)

/* Pick the header interpolation mode for a fragment shader input. */
static uint8_t
nvc0_hdr_interp_mode(const struct nv50_ir_varying *var)
{
   if (var->linear)
      return NVC0_INTERP_LINEAR;
   if (var->flat)
      return NVC0_INTERP_FLAT;
   return NVC0_INTERP_PERSPECTIVE;
}

/* Build the program header for a fragment shader: input interpolation
 * map, colour output mask, depth/sample-mask flags.
 */
static int
nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
{
   unsigned i, c, a, m;

   /* just 00062 on Kepler */
   fp->hdr[0] = 0x20062 | (5 << 10);
   fp->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */

   if (info->prop.fp.usesDiscard)
      fp->hdr[0] |= 0x8000;
   if (info->prop.fp.numColourResults > 1)
      fp->hdr[0] |= 0x4000;
   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
      fp->hdr[19] |= 0x1;
   if (info->prop.fp.writesDepth) {
      fp->hdr[19] |= 0x2;
      fp->flags[0] = 0x11; /* deactivate ZCULL */
   }

   for (i = 0; i < info->numInputs; ++i) {
      m = nvc0_hdr_interp_mode(&info->in[i]);
      if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
         fp->fp.colors |= 1 << info->in[i].si;
         if (info->in[i].sc)
            fp->fp.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
      }
      for (c = 0; c < 4; ++c) {
         if (!(info->in[i].mask & (1 << c)))
            continue;
         a = info->in[i].slot[c];
         /* System-value range 0x060..0x07c: enable bits in hdr[5]. */
         if (info->in[i].slot[0] >= (0x060 / 4) &&
             info->in[i].slot[0] <= (0x07c / 4)) {
            fp->hdr[5] |= 1 << (24 + (a - 0x060 / 4));
         } else
         /* Range 0x2c0..0x2fc (clip distances etc.): bits in hdr[14]. */
         if (info->in[i].slot[0] >= (0x2c0 / 4) &&
             info->in[i].slot[0] <= (0x2fc / 4)) {
            fp->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
         } else {
            /* Generic varyings: 2 bits of interpolation mode per slot. */
            if (info->in[i].slot[c] < (0x040 / 4) ||
                info->in[i].slot[c] > (0x380 / 4))
               continue;
            a *= 2;
            if (info->in[i].slot[0] >= (0x300 / 4))
               a -= 32;
            fp->hdr[4 + a / 32] |= m << (a % 32);
         }
      }
   }

   for (i = 0; i < info->numOutputs; ++i) {
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         fp->hdr[18] |= 0xf << info->out[i].slot[0];
   }

   /* There are no "regular" attachments, but the shader still needs to be
    * executed. It seems like it wants to think that it has some color
    * outputs in order to actually run.
    */
   if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth)
      fp->hdr[18] |= 0xf;

   fp->fp.early_z = info->prop.fp.earlyFragTests;
   fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn;
   fp->fp.reads_framebuffer = info->prop.fp.readsFramebuffer;

   /* Mark position xy and layer as read */
   if (fp->fp.reads_framebuffer)
      fp->hdr[5] |= 0x32000000;

   return 0;
}

/* Translate gallium stream-output state into the hardware varying-index
 * tables. Caller owns the returned allocation; NULL on OOM.
 */
static struct nvc0_transform_feedback_state *
nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info,
                              const struct pipe_stream_output_info *pso)
{
   struct nvc0_transform_feedback_state *tfb;
   unsigned b, i, c;

   tfb = MALLOC_STRUCT(nvc0_transform_feedback_state);
   if (!tfb)
      return NULL;
   for (b = 0; b < 4; ++b) {
      tfb->stride[b] = pso->stride[b] * 4;
      tfb->varying_count[b] = 0;
   }
   memset(tfb->varying_index, 0xff, sizeof(tfb->varying_index)); /* = skip */

   for (i = 0; i < pso->num_outputs; ++i) {
      unsigned s = pso->output[i].start_component;
      unsigned p = pso->output[i].dst_offset;
      const unsigned r = pso->output[i].register_index;
      b = pso->output[i].output_buffer;

      if (r >= info->numOutputs)
         continue;

      for (c = 0; c < pso->output[i].num_components; ++c)
         tfb->varying_index[b][p++] = info->out[r].slot[s + c];

      tfb->varying_count[b] = MAX2(tfb->varying_count[b], p);
      tfb->stream[b] = pso->output[i].stream;
   }
   for (b = 0; b < 4; ++b) // zero unused indices (looks nicer)
      for (c = tfb->varying_count[b]; c & 3; ++c)
         tfb->varying_index[b][c] = 0;

   return tfb;
}

#ifdef DEBUG
/* Dump the shader header (non-compute only) and binary code words. */
static void
nvc0_program_dump(struct nvc0_program *prog)
{
   unsigned pos;

   if (prog->type != PIPE_SHADER_COMPUTE) {
      for (pos = 0; pos < ARRAY_SIZE(prog->hdr); ++pos)
         debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",
                      pos * sizeof(prog->hdr[0]), prog->hdr[pos]);
   }
   debug_printf("shader binary code (0x%x bytes):", prog->code_size);
   for (pos = 0; pos < prog->code_size / 4; ++pos) {
      if ((pos % 8) == 0)
         debug_printf("\n");
      debug_printf("%08x ", prog->code[pos]);
   }
   debug_printf("\n");
}
#endif

/* Compile the program's TGSI tokens to hardware code via the nv50 IR
 * backend and generate the per-stage program header. Returns false on
 * failure; on success prog->code/relocs/fixups point at the generated
 * binary (ownership transferred to prog).
 */
bool
nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
                       struct pipe_debug_callback *debug)
{
   struct nv50_ir_prog_info *info;
   int ret;

   info = CALLOC_STRUCT(nv50_ir_prog_info);
   if (!info)
      return false;

   info->type = prog->type;
   info->target = chipset;
   info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
   info->bin.source = (void *)prog->pipe.tokens;

#ifdef DEBUG
   info->target = debug_get_num_option("NV50_PROG_CHIPSET", chipset);
   info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
   info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
#else
   info->optLevel = 3;
#endif

   /* Locations of driver-internal data in the auxiliary constant buffer. */
   info->io.genUserClip = prog->vp.num_ucps;
   info->io.auxCBSlot = 15;
   info->io.msInfoCBSlot = 15;
   info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
   info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
   info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
   info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
   info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0);
   if (info->target >= NVISA_GK104_CHIPSET) {
      info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
      info->io.fbtexBindBase = NVC0_CB_AUX_FB_TEX_INFO;
   }

   if (prog->type == PIPE_SHADER_COMPUTE) {
      if (info->target >= NVISA_GK104_CHIPSET) {
         info->io.auxCBSlot = 7;
         info->io.msInfoCBSlot = 7;
         info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
      }
      info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO(0);
   } else {
      info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
   }

   info->assignSlots = nvc0_program_assign_varying_slots;

   ret = nv50_ir_generate_code(info);
   if (ret) {
      NOUVEAU_ERR("shader translation failed: %i\n", ret);
      goto out;
   }
   /* Symbols are only kept for compute programs (see below). */
   if (prog->type != PIPE_SHADER_COMPUTE)
      FREE(info->bin.syms);

   prog->code = info->bin.code;
   prog->code_size = info->bin.codeSize;
   prog->relocs = info->bin.relocData;
   prog->fixups = info->bin.fixupData;
   prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
   prog->num_barriers = info->numBarriers;

   prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
   prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters;

   if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
      info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
   prog->vp.edgeflag = info->io.edgeFlagIn;

   switch (prog->type) {
   case PIPE_SHADER_VERTEX:
      ret = nvc0_vp_gen_header(prog, info);
      break;
   case PIPE_SHADER_TESS_CTRL:
      ret = nvc0_tcp_gen_header(prog, info);
      break;
   case PIPE_SHADER_TESS_EVAL:
      ret = nvc0_tep_gen_header(prog, info);
      break;
   case PIPE_SHADER_GEOMETRY:
      ret = nvc0_gp_gen_header(prog, info);
      break;
   case PIPE_SHADER_FRAGMENT:
      ret = nvc0_fp_gen_header(prog, info);
      break;
   case PIPE_SHADER_COMPUTE:
      prog->cp.syms = info->bin.syms;
      prog->cp.num_syms = info->bin.numSyms;
      break;
   default:
      ret = -1;
      NOUVEAU_ERR("unknown program type: %u\n", prog->type);
      break;
   }
   if (ret)
      goto out;

   if (info->bin.tlsSpace) {
      assert(info->bin.tlsSpace < (1 << 24));
      prog->hdr[0] |= 1 << 26;
      prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */
      prog->need_tls = true;
   }
   /* TODO: factor 2 only needed where joinat/precont is used,
    * and we only have to count non-uniform branches
    */
   /*
   if ((info->maxCFDepth * 2) > 16) {
      prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;
      prog->need_tls = true;
   }
   */
   if (info->io.globalAccess)
      prog->hdr[0] |= 1 << 26;
   if (info->io.globalAccess & 0x2)
      prog->hdr[0] |= 1 << 16;
   if (info->io.fp64)
      prog->hdr[0] |= 1 << 27;

   if (prog->pipe.stream_output.num_outputs)
      prog->tfb = nvc0_program_create_tfb_state(info,
                                                &prog->pipe.stream_output);

   pipe_debug_message(debug, SHADER_INFO,
                      "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
                      prog->type, info->bin.tlsSpace, prog->num_gprs,
                      info->bin.instructions, info->bin.codeSize);

#ifdef DEBUG
   if (debug_get_option("NV50_PROG_CHIPSET", NULL) && info->dbgFlags)
      nvc0_program_dump(prog);
#endif

out:
   FREE(info);
   return !ret;
}

/* Allocate space for the program (header + code) in the screen's code
 * segment and compute the aligned code_base address.
 */
static inline int
nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   struct nvc0_screen *screen = nvc0->screen;
   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
   int ret;
   uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);

   /* On Fermi, SP_START_ID must be aligned to 0x40.
    * On Kepler, the first instruction must be aligned to 0x80 because
    * latency information is expected only at certain positions.
    */
   if (screen->base.class_3d >= NVE4_3D_CLASS)
      size = size + (is_cp ? 0x40 : 0x70);
   size = align(size, 0x40);

   ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
   if (ret)
      return ret;
   prog->code_base = prog->mem->start;

   if (!is_cp) {
      if (screen->base.class_3d >= NVE4_3D_CLASS) {
         /* Pad so that code (after the 0x50-byte header) lands on an
          * 0x80-aligned address; padding depends on the start offset.
          */
         switch (prog->mem->start & 0xff) {
         case 0x40: prog->code_base += 0x70; break;
         case 0x80: prog->code_base += 0x30; break;
         case 0xc0: prog->code_base += 0x70; break;
         default:
            prog->code_base += 0x30;
            assert((prog->mem->start & 0xff) == 0x00);
            break;
         }
      }
   } else {
      if (screen->base.class_3d >= NVE4_3D_CLASS) {
         if (prog->mem->start & 0x40)
            prog->code_base += 0x40;
         assert((prog->code_base & 0x7f) == 0x00);
      }
   }

   return 0;
}

/* Upload header and code into the code segment, applying relocations and
 * runtime fixups (persample interp / flatshade colour interpolation) first.
 */
static inline void
nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   struct nvc0_screen *screen = nvc0->screen;
   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
   uint32_t code_pos = prog->code_base + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);

   if (prog->relocs)
      nv50_ir_relocate_code(prog->relocs, prog->code, code_pos,
                            screen->lib_code->start, 0);
   if (prog->fixups) {
      nv50_ir_apply_fixups(prog->fixups, prog->code,
                           prog->fp.force_persample_interp,
                           prog->fp.flatshade,
                           0 /* alphatest */);
      /* Rewrite the colour interpolation bits in hdr[14] for the two
       * colour semantics according to the current flatshade state.
       */
      for (int i = 0; i < 2; i++) {
         unsigned mask = prog->fp.color_interp[i] >> 4;
         unsigned interp = prog->fp.color_interp[i] & 3;
         if (!mask)
            continue;
         prog->hdr[14] &= ~(0xff << (8 * i));
         if (prog->fp.flatshade)
            interp = NVC0_INTERP_FLAT;
         for (int c = 0; c < 4; c++)
            if (mask & (1 << c))
               prog->hdr[14] |= interp << (2 * (4 * i + c));
      }
   }

   if (!is_cp)
      nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
                           NV_VRAM_DOMAIN(&screen->base),
                           NVC0_SHADER_HEADER_SIZE, prog->hdr);

   nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
                        NV_VRAM_DOMAIN(&screen->base), prog->code_size,
                        prog->code);
}

/* Place the program in the code segment, evicting and re-uploading other
 * shaders (and growing the TEXT area) if it does not fit. Returns false
 * only if the shader cannot be made to fit at all.
 */
bool
nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   struct nvc0_screen *screen = nvc0->screen;
   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
   int ret;
   uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);

   ret = nvc0_program_alloc_code(nvc0, prog);
   if (ret) {
      struct nouveau_heap *heap = screen->text_heap;
      struct nvc0_program *progs[] = { /* Sorted accordingly to SP_START_ID */
         nvc0->compprog, nvc0->vertprog, nvc0->tctlprog,
         nvc0->tevlprog, nvc0->gmtyprog, nvc0->fragprog
      };

      /* Note that the code library, which is allocated before anything else,
       * does not have a priv pointer. We can stop once we hit it.
       */
      while (heap->next && heap->next->priv) {
         struct nvc0_program *evict = heap->next->priv;
         nouveau_heap_free(&evict->mem);
      }
      debug_printf("WARNING: out of code space, evicting all shaders.\n");

      /* Make sure to synchronize before deleting the code segment. */
      IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);

      if ((screen->text->size << 1) <= (1 << 23)) {
         ret = nvc0_screen_resize_text_area(screen, screen->text->size << 1);
         if (ret) {
            NOUVEAU_ERR("Error allocating TEXT area: %d\n", ret);
            return false;
         }
         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEXT);
         BCTX_REFN_bo(nvc0->bufctx_3d, 3D_TEXT,
                      NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD,
                      screen->text);
         if (screen->compute) {
            nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEXT);
            BCTX_REFN_bo(nvc0->bufctx_cp, CP_TEXT,
                         NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD,
                         screen->text);
         }

         /* Re-upload the builtin function into the new code segment. */
         nvc0_program_library_upload(nvc0);
      }

      ret = nvc0_program_alloc_code(nvc0, prog);
      if (ret) {
         NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
         return false;
      }

      /* All currently bound shaders have to be reuploaded. */
      for (int i = 0; i < ARRAY_SIZE(progs); i++) {
         if (!progs[i] || progs[i] == prog)
            continue;

         ret = nvc0_program_alloc_code(nvc0, progs[i]);
         if (ret) {
            NOUVEAU_ERR("failed to re-upload a shader after code eviction.\n");
            return false;
         }
         nvc0_program_upload_code(nvc0, progs[i]);

         if (progs[i]->type == PIPE_SHADER_COMPUTE) {
            /* Caches have to be invalidated but the CP_START_ID will be
             * updated in the launch_grid functions.
             */
            BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1);
            PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE);
         } else {
            BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(SP_START_ID(i)), 1);
            PUSH_DATA (nvc0->base.pushbuf, progs[i]->code_base);
         }
      }
   }

   nvc0_program_upload_code(nvc0, prog);

#ifdef DEBUG
   if (debug_get_bool_option("NV50_PROG_DEBUG", false))
      nvc0_program_dump(prog);
#endif

   BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
   PUSH_DATA (nvc0->base.pushbuf, 0x1011);

   return true;
}

/* Upload code for builtin functions like integer division emulation. */
void
nvc0_program_library_upload(struct nvc0_context *nvc0)
{
   struct nvc0_screen *screen = nvc0->screen;
   int ret;
   uint32_t size;
   const uint32_t *code;

   if (screen->lib_code)
      return;

   nv50_ir_get_target_library(screen->base.device->chipset, &code, &size);
   if (!size)
      return;

   ret = nouveau_heap_alloc(screen->text_heap, align(size, 0x100), NULL,
                            &screen->lib_code);
   if (ret)
      return;

   nvc0->base.push_data(&nvc0->base,
                        screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base),
                        size, code);
   /* no need for a memory barrier, will be emitted with first program */
}

/* Release all resources owned by the program and reset it to an empty
 * state, preserving only its pipe state and shader type.
 */
void
nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   const struct pipe_shader_state pipe = prog->pipe;
   const ubyte type = prog->type;

   if (prog->mem)
      nouveau_heap_free(&prog->mem);
   FREE(prog->code); /* may be 0 for hardcoded shaders */
   FREE(prog->relocs);
   FREE(prog->fixups);
   if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
      FREE(prog->cp.syms);
   if (prog->tfb) {
      if (nvc0->state.tfb == prog->tfb)
         nvc0->state.tfb = NULL;
      FREE(prog->tfb);
   }

   memset(prog, 0, sizeof(*prog));

   prog->pipe = pipe;
   prog->type = type;
}

/* Look up the code address of a (compute) symbol by label; falls back to
 * the program's code base if the label is not found.
 */
uint32_t
nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
{
   const struct nv50_ir_prog_symbol *syms =
      (const struct nv50_ir_prog_symbol *)prog->cp.syms;
   unsigned base = 0;
   unsigned i;
   if (prog->type != PIPE_SHADER_COMPUTE)
      base = NVC0_SHADER_HEADER_SIZE;
   for (i = 0; i < prog->cp.num_syms; ++i)
      if (syms[i].label == label)
         return prog->code_base + base + syms[i].offset;
   return prog->code_base; /* no symbols or symbol not found */
}

/* Create the empty pass-through TCS used when the application binds a TES
 * without a TCS.
 */
void
nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)
{
   struct ureg_program *ureg;

   ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
   if (!ureg)
      return;

   ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);
   ureg_END(ureg);

   nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);
}