Home | History | Annotate | Download | only in nv50
      1 /*
      2  * Copyright 2010 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "nv50/nv50_program.h"
     24 #include "nv50/nv50_context.h"
     25 
     26 #include "codegen/nv50_ir_driver.h"
     27 
     28 static inline unsigned
     29 bitcount4(const uint32_t val)
     30 {
     31    static const uint8_t cnt[16]
     32    = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
     33    return cnt[val & 0xf];
     34 }
     35 
     36 static int
     37 nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
     38 {
     39    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
     40    unsigned i, n, c;
     41 
     42    n = 0;
     43    for (i = 0; i < info->numInputs; ++i) {
     44       prog->in[i].id = i;
     45       prog->in[i].sn = info->in[i].sn;
     46       prog->in[i].si = info->in[i].si;
     47       prog->in[i].hw = n;
     48       prog->in[i].mask = info->in[i].mask;
     49 
     50       prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
     51 
     52       for (c = 0; c < 4; ++c)
     53          if (info->in[i].mask & (1 << c))
     54             info->in[i].slot[c] = n++;
     55 
     56       if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
     57          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
     58    }
     59    prog->in_nr = info->numInputs;
     60 
     61    for (i = 0; i < info->numSysVals; ++i) {
     62       switch (info->sv[i].sn) {
     63       case TGSI_SEMANTIC_INSTANCEID:
     64          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
     65          continue;
     66       case TGSI_SEMANTIC_VERTEXID:
     67          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
     68          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
     69          continue;
     70       default:
     71          break;
     72       }
     73    }
     74 
     75    /*
     76     * Corner case: VP has no inputs, but we will still need to submit data to
     77     * draw it. HW will shout at us and won't draw anything if we don't enable
     78     * any input, so let's just pretend it's the first one.
     79     */
     80    if (prog->vp.attrs[0] == 0 &&
     81        prog->vp.attrs[1] == 0 &&
     82        prog->vp.attrs[2] == 0)
     83       prog->vp.attrs[0] |= 0xf;
     84 
     85    /* VertexID before InstanceID */
     86    if (info->io.vertexId < info->numSysVals)
     87       info->sv[info->io.vertexId].slot[0] = n++;
     88    if (info->io.instanceId < info->numSysVals)
     89       info->sv[info->io.instanceId].slot[0] = n++;
     90 
     91    n = 0;
     92    for (i = 0; i < info->numOutputs; ++i) {
     93       switch (info->out[i].sn) {
     94       case TGSI_SEMANTIC_PSIZE:
     95          prog->vp.psiz = i;
     96          break;
     97       case TGSI_SEMANTIC_CLIPDIST:
     98          prog->vp.clpd[info->out[i].si] = n;
     99          break;
    100       case TGSI_SEMANTIC_EDGEFLAG:
    101          prog->vp.edgeflag = i;
    102          break;
    103       case TGSI_SEMANTIC_BCOLOR:
    104          prog->vp.bfc[info->out[i].si] = i;
    105          break;
    106       case TGSI_SEMANTIC_LAYER:
    107          prog->gp.has_layer = true;
    108          prog->gp.layerid = n;
    109          break;
    110       case TGSI_SEMANTIC_VIEWPORT_INDEX:
    111          prog->gp.has_viewport = true;
    112          prog->gp.viewportid = n;
    113          break;
    114       default:
    115          break;
    116       }
    117       prog->out[i].id = i;
    118       prog->out[i].sn = info->out[i].sn;
    119       prog->out[i].si = info->out[i].si;
    120       prog->out[i].hw = n;
    121       prog->out[i].mask = info->out[i].mask;
    122 
    123       for (c = 0; c < 4; ++c)
    124          if (info->out[i].mask & (1 << c))
    125             info->out[i].slot[c] = n++;
    126    }
    127    prog->out_nr = info->numOutputs;
    128    prog->max_out = n;
    129    if (!prog->max_out)
    130       prog->max_out = 1;
    131 
    132    if (prog->vp.psiz < info->numOutputs)
    133       prog->vp.psiz = prog->out[prog->vp.psiz].hw;
    134 
    135    return 0;
    136 }
    137 
    138 static int
    139 nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
    140 {
    141    struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
    142    unsigned i, n, m, c;
    143    unsigned nvary;
    144    unsigned nflat;
    145    unsigned nintp = 0;
    146 
    147    /* count recorded non-flat inputs */
    148    for (m = 0, i = 0; i < info->numInputs; ++i) {
    149       switch (info->in[i].sn) {
    150       case TGSI_SEMANTIC_POSITION:
    151          continue;
    152       default:
    153          m += info->in[i].flat ? 0 : 1;
    154          break;
    155       }
    156    }
    157    /* careful: id may be != i in info->in[prog->in[i].id] */
    158 
    159    /* Fill prog->in[] so that non-flat inputs are first and
    160     * kick out special inputs that don't use the RESULT_MAP.
    161     */
    162    for (n = 0, i = 0; i < info->numInputs; ++i) {
    163       if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
    164          prog->fp.interp |= info->in[i].mask << 24;
    165          for (c = 0; c < 4; ++c)
    166             if (info->in[i].mask & (1 << c))
    167                info->in[i].slot[c] = nintp++;
    168       } else {
    169          unsigned j = info->in[i].flat ? m++ : n++;
    170 
    171          if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
    172             prog->vp.bfc[info->in[i].si] = j;
    173          else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
    174             prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
    175 
    176          prog->in[j].id = i;
    177          prog->in[j].mask = info->in[i].mask;
    178          prog->in[j].sn = info->in[i].sn;
    179          prog->in[j].si = info->in[i].si;
    180          prog->in[j].linear = info->in[i].linear;
    181 
    182          prog->in_nr++;
    183       }
    184    }
    185    if (!(prog->fp.interp & (8 << 24))) {
    186       ++nintp;
    187       prog->fp.interp |= 8 << 24;
    188    }
    189 
    190    for (i = 0; i < prog->in_nr; ++i) {
    191       int j = prog->in[i].id;
    192 
    193       prog->in[i].hw = nintp;
    194       for (c = 0; c < 4; ++c)
    195          if (prog->in[i].mask & (1 << c))
    196             info->in[j].slot[c] = nintp++;
    197    }
    198    /* (n == m) if m never increased, i.e. no flat inputs */
    199    nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
    200    nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
    201    nvary = nintp - nflat;
    202 
    203    prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
    204    prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
    205 
    206    /* put front/back colors right after HPOS */
    207    prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
    208    for (i = 0; i < 2; ++i)
    209       if (prog->vp.bfc[i] < 0xff)
    210          prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
    211 
    212    /* FP outputs */
    213 
    214    if (info->prop.fp.numColourResults > 1)
    215       prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
    216 
    217    for (i = 0; i < info->numOutputs; ++i) {
    218       prog->out[i].id = i;
    219       prog->out[i].sn = info->out[i].sn;
    220       prog->out[i].si = info->out[i].si;
    221       prog->out[i].mask = info->out[i].mask;
    222 
    223       if (i == info->io.fragDepth || i == info->io.sampleMask)
    224          continue;
    225       prog->out[i].hw = info->out[i].si * 4;
    226 
    227       for (c = 0; c < 4; ++c)
    228          info->out[i].slot[c] = prog->out[i].hw + c;
    229 
    230       prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
    231    }
    232 
    233    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {
    234       info->out[info->io.sampleMask].slot[0] = prog->max_out++;
    235       prog->fp.has_samplemask = 1;
    236    }
    237 
    238    if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
    239       info->out[info->io.fragDepth].slot[2] = prog->max_out++;
    240 
    241    if (!prog->max_out)
    242       prog->max_out = 4;
    243 
    244    return 0;
    245 }
    246 
    247 static int
    248 nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
    249 {
    250    switch (info->type) {
    251    case PIPE_SHADER_VERTEX:
    252       return nv50_vertprog_assign_slots(info);
    253    case PIPE_SHADER_GEOMETRY:
    254       return nv50_vertprog_assign_slots(info);
    255    case PIPE_SHADER_FRAGMENT:
    256       return nv50_fragprog_assign_slots(info);
    257    case PIPE_SHADER_COMPUTE:
    258       return 0;
    259    default:
    260       return -1;
    261    }
    262 }
    263 
    264 static struct nv50_stream_output_state *
    265 nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
    266                                   const struct pipe_stream_output_info *pso)
    267 {
    268    struct nv50_stream_output_state *so;
    269    unsigned b, i, c;
    270    unsigned base[4];
    271 
    272    so = MALLOC_STRUCT(nv50_stream_output_state);
    273    if (!so)
    274       return NULL;
    275    memset(so->map, 0xff, sizeof(so->map));
    276 
    277    for (b = 0; b < 4; ++b)
    278       so->num_attribs[b] = 0;
    279    for (i = 0; i < pso->num_outputs; ++i) {
    280       unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
    281       b = pso->output[i].output_buffer;
    282       assert(b < 4);
    283       so->num_attribs[b] = MAX2(so->num_attribs[b], end);
    284    }
    285 
    286    so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
    287 
    288    so->stride[0] = pso->stride[0] * 4;
    289    base[0] = 0;
    290    for (b = 1; b < 4; ++b) {
    291       assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
    292       so->stride[b] = so->num_attribs[b] * 4;
    293       if (so->num_attribs[b])
    294          so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
    295       base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
    296    }
    297    if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
    298       assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
    299       so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
    300    }
    301 
    302    so->map_size = base[3] + so->num_attribs[3];
    303 
    304    for (i = 0; i < pso->num_outputs; ++i) {
    305       const unsigned s = pso->output[i].start_component;
    306       const unsigned p = pso->output[i].dst_offset;
    307       const unsigned r = pso->output[i].register_index;
    308       b = pso->output[i].output_buffer;
    309 
    310       if (r >= info->numOutputs)
    311          continue;
    312 
    313       for (c = 0; c < pso->output[i].num_components; ++c)
    314          so->map[base[b] + p + c] = info->out[r].slot[s + c];
    315    }
    316 
    317    return so;
    318 }
    319 
    320 bool
    321 nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    322                        struct pipe_debug_callback *debug)
    323 {
    324    struct nv50_ir_prog_info *info;
    325    int i, ret;
    326    const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
    327 
    328    info = CALLOC_STRUCT(nv50_ir_prog_info);
    329    if (!info)
    330       return false;
    331 
    332    info->type = prog->type;
    333    info->target = chipset;
    334    info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    335    info->bin.source = (void *)prog->pipe.tokens;
    336 
    337    info->io.auxCBSlot = 15;
    338    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    339    info->io.genUserClip = prog->vp.clpd_nr;
    340    if (prog->fp.alphatest)
    341       info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;
    342 
    343    info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
    344    info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
    345    info->io.msInfoCBSlot = 15;
    346    info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
    347 
    348    info->assignSlots = nv50_program_assign_varying_slots;
    349 
    350    prog->vp.bfc[0] = 0xff;
    351    prog->vp.bfc[1] = 0xff;
    352    prog->vp.edgeflag = 0xff;
    353    prog->vp.clpd[0] = map_undef;
    354    prog->vp.clpd[1] = map_undef;
    355    prog->vp.psiz = map_undef;
    356    prog->gp.has_layer = 0;
    357    prog->gp.has_viewport = 0;
    358 
    359    if (prog->type == PIPE_SHADER_COMPUTE)
    360       info->prop.cp.inputOffset = 0x10;
    361 
    362    info->driverPriv = prog;
    363 
    364 #ifdef DEBUG
    365    info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
    366    info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
    367 #else
    368    info->optLevel = 3;
    369 #endif
    370 
    371    ret = nv50_ir_generate_code(info);
    372    if (ret) {
    373       NOUVEAU_ERR("shader translation failed: %i\n", ret);
    374       goto out;
    375    }
    376 
    377    prog->code = info->bin.code;
    378    prog->code_size = info->bin.codeSize;
    379    prog->fixups = info->bin.relocData;
    380    prog->interps = info->bin.fixupData;
    381    prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
    382    prog->tls_space = info->bin.tlsSpace;
    383 
    384    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
    385 
    386    prog->vp.clip_enable = (1 << info->io.clipDistances) - 1;
    387    prog->vp.cull_enable =
    388       ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
    389    prog->vp.clip_mode = 0;
    390    for (i = 0; i < info->io.cullDistances; ++i)
    391       prog->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
    392 
    393    if (prog->type == PIPE_SHADER_FRAGMENT) {
    394       if (info->prop.fp.writesDepth) {
    395          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
    396          prog->fp.flags[1] = 0x11;
    397       }
    398       if (info->prop.fp.usesDiscard)
    399          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
    400    } else
    401    if (prog->type == PIPE_SHADER_GEOMETRY) {
    402       switch (info->prop.gp.outputPrim) {
    403       case PIPE_PRIM_LINE_STRIP:
    404          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
    405          break;
    406       case PIPE_PRIM_TRIANGLE_STRIP:
    407          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
    408          break;
    409       case PIPE_PRIM_POINTS:
    410       default:
    411          assert(info->prop.gp.outputPrim == PIPE_PRIM_POINTS);
    412          prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
    413          break;
    414       }
    415       prog->gp.vert_count = CLAMP(info->prop.gp.maxVertices, 1, 1024);
    416    }
    417 
    418    if (prog->type == PIPE_SHADER_COMPUTE) {
    419       prog->cp.syms = info->bin.syms;
    420       prog->cp.num_syms = info->bin.numSyms;
    421    } else {
    422       FREE(info->bin.syms);
    423    }
    424 
    425    if (prog->pipe.stream_output.num_outputs)
    426       prog->so = nv50_program_create_strmout_state(info,
    427                                                    &prog->pipe.stream_output);
    428 
    429    pipe_debug_message(debug, SHADER_INFO,
    430                       "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
    431                       prog->type, info->bin.tlsSpace, prog->max_gpr,
    432                       info->bin.instructions, info->bin.codeSize);
    433 
    434 out:
    435    FREE(info);
    436    return !ret;
    437 }
    438 
    439 bool
    440 nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    441 {
    442    struct nouveau_heap *heap;
    443    int ret;
    444    uint32_t size = align(prog->code_size, 0x40);
    445    uint8_t prog_type;
    446 
    447    switch (prog->type) {
    448    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
    449    case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
    450    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
    451    case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
    452    default:
    453       assert(!"invalid program type");
    454       return false;
    455    }
    456 
    457    ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
    458    if (ret) {
    459       /* Out of space: evict everything to compactify the code segment, hoping
    460        * the working set is much smaller and drifts slowly. Improve me !
    461        */
    462       while (heap->next) {
    463          struct nv50_program *evict = heap->next->priv;
    464          if (evict)
    465             nouveau_heap_free(&evict->mem);
    466       }
    467       debug_printf("WARNING: out of code space, evicting all shaders.\n");
    468       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
    469       if (ret) {
    470          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
    471          return false;
    472       }
    473    }
    474 
    475    if (prog->type == PIPE_SHADER_COMPUTE) {
    476       /* CP code must be uploaded in FP code segment. */
    477       prog_type = 1;
    478    } else {
    479       prog->code_base = prog->mem->start;
    480       prog_type = prog->type;
    481    }
    482 
    483    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
    484    if (ret < 0) {
    485       nouveau_heap_free(&prog->mem);
    486       return false;
    487    }
    488    if (ret > 0)
    489       nv50->state.new_tls_space = true;
    490 
    491    if (prog->fixups)
    492       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
    493    if (prog->interps)
    494       nv50_ir_apply_fixups(prog->interps, prog->code,
    495                            prog->fp.force_persample_interp,
    496                            false /* flatshade */,
    497                            prog->fp.alphatest - 1);
    498 
    499    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
    500                        (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
    501                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);
    502 
    503    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
    504    PUSH_DATA (nv50->base.pushbuf, 0);
    505 
    506    return true;
    507 }
    508 
    509 void
    510 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
    511 {
    512    const struct pipe_shader_state pipe = p->pipe;
    513    const ubyte type = p->type;
    514 
    515    if (p->mem)
    516       nouveau_heap_free(&p->mem);
    517 
    518    FREE(p->code);
    519 
    520    FREE(p->fixups);
    521    FREE(p->interps);
    522    FREE(p->so);
    523 
    524    if (type == PIPE_SHADER_COMPUTE)
    525       FREE(p->cp.syms);
    526 
    527    memset(p, 0, sizeof(*p));
    528 
    529    p->pipe = pipe;
    530    p->type = type;
    531 }
    532