Home | History | Annotate | Download | only in nvc0
      1 /*
      2  * Copyright 2010 Christoph Bumiller
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     20  * OTHER DEALINGS IN THE SOFTWARE.
     21  */
     22 
     23 #include "pipe/p_defines.h"
     24 
     25 #include "tgsi/tgsi_ureg.h"
     26 
     27 #include "nvc0/nvc0_context.h"
     28 
     29 #include "codegen/nv50_ir_driver.h"
     30 #include "nvc0/nve4_compute.h"
     31 
     32 /* NOTE: Using a[0x270] in FP may cause an error even if we're using less than
     33  * 124 scalar varying values.
     34  */
     35 static uint32_t
     36 nvc0_shader_input_address(unsigned sn, unsigned si)
     37 {
     38    switch (sn) {
     39    case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
     40    case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
     41    case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
     42    case TGSI_SEMANTIC_PRIMID:       return 0x060;
     43    case TGSI_SEMANTIC_LAYER:        return 0x064;
     44    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
     45    case TGSI_SEMANTIC_PSIZE:        return 0x06c;
     46    case TGSI_SEMANTIC_POSITION:     return 0x070;
     47    case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
     48    case TGSI_SEMANTIC_FOG:          return 0x2e8;
     49    case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
     50    case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
     51    case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
     52    case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
     53    case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
     54    case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
     55    case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
     56    case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
     57    case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
     58    default:
     59       assert(!"invalid TGSI input semantic");
     60       return ~0;
     61    }
     62 }
     63 
     64 static uint32_t
     65 nvc0_shader_output_address(unsigned sn, unsigned si)
     66 {
     67    switch (sn) {
     68    case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
     69    case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
     70    case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
     71    case TGSI_SEMANTIC_PRIMID:        return 0x060;
     72    case TGSI_SEMANTIC_LAYER:         return 0x064;
     73    case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
     74    case TGSI_SEMANTIC_PSIZE:         return 0x06c;
     75    case TGSI_SEMANTIC_POSITION:      return 0x070;
     76    case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
     77    case TGSI_SEMANTIC_FOG:           return 0x2e8;
     78    case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
     79    case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
     80    case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
     81    case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
     82    case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
     83    /* case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0; */
     84    case TGSI_SEMANTIC_EDGEFLAG:      return ~0;
     85    default:
     86       assert(!"invalid TGSI output semantic");
     87       return ~0;
     88    }
     89 }
     90 
     91 static int
     92 nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
     93 {
     94    unsigned i, c, n;
     95 
     96    for (n = 0, i = 0; i < info->numInputs; ++i) {
     97       switch (info->in[i].sn) {
     98       case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
     99       case TGSI_SEMANTIC_VERTEXID:
    100          info->in[i].mask = 0x1;
    101          info->in[i].slot[0] =
    102             nvc0_shader_input_address(info->in[i].sn, 0) / 4;
    103          continue;
    104       default:
    105          break;
    106       }
    107       for (c = 0; c < 4; ++c)
    108          info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
    109       ++n;
    110    }
    111 
    112    return 0;
    113 }
    114 
    115 static int
    116 nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
    117 {
    118    unsigned offset;
    119    unsigned i, c;
    120 
    121    for (i = 0; i < info->numInputs; ++i) {
    122       offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);
    123 
    124       for (c = 0; c < 4; ++c)
    125          info->in[i].slot[c] = (offset + c * 0x4) / 4;
    126    }
    127 
    128    return 0;
    129 }
    130 
    131 static int
    132 nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
    133 {
    134    unsigned count = info->prop.fp.numColourResults * 4;
    135    unsigned i, c;
    136 
    137    for (i = 0; i < info->numOutputs; ++i)
    138       if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
    139          for (c = 0; c < 4; ++c)
    140             info->out[i].slot[c] = info->out[i].si * 4 + c;
    141 
    142    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
    143       info->out[info->io.sampleMask].slot[0] = count++;
    144    else
    145    if (info->target >= 0xe0)
    146       count++; /* on Kepler, depth is always last colour reg + 2 */
    147 
    148    if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
    149       info->out[info->io.fragDepth].slot[2] = count;
    150 
    151    return 0;
    152 }
    153 
    154 static int
    155 nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
    156 {
    157    unsigned offset;
    158    unsigned i, c;
    159 
    160    for (i = 0; i < info->numOutputs; ++i) {
    161       offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);
    162 
    163       for (c = 0; c < 4; ++c)
    164          info->out[i].slot[c] = (offset + c * 0x4) / 4;
    165    }
    166 
    167    return 0;
    168 }
    169 
    170 static int
    171 nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info)
    172 {
    173    int ret;
    174 
    175    if (info->type == PIPE_SHADER_VERTEX)
    176       ret = nvc0_vp_assign_input_slots(info);
    177    else
    178       ret = nvc0_sp_assign_input_slots(info);
    179    if (ret)
    180       return ret;
    181 
    182    if (info->type == PIPE_SHADER_FRAGMENT)
    183       ret = nvc0_fp_assign_output_slots(info);
    184    else
    185       ret = nvc0_sp_assign_output_slots(info);
    186    return ret;
    187 }
    188 
    189 static inline void
    190 nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)
    191 {
    192    uint8_t min = (vp->hdr[4] >> 12) & 0xff;
    193    uint8_t max = (vp->hdr[4] >> 24);
    194 
    195    min = MIN2(min, slot);
    196    max = MAX2(max, slot);
    197 
    198    vp->hdr[4] = (max << 24) | (min << 12);
    199 }
    200 
    201 /* Common part of header generation for VP, TCP, TEP and GP. */
    202 static int
    203 nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
    204 {
    205    unsigned i, c, a;
    206 
    207    for (i = 0; i < info->numInputs; ++i) {
    208       if (info->in[i].patch)
    209          continue;
    210       for (c = 0; c < 4; ++c) {
    211          a = info->in[i].slot[c];
    212          if (info->in[i].mask & (1 << c))
    213             vp->hdr[5 + a / 32] |= 1 << (a % 32);
    214       }
    215    }
    216 
    217    for (i = 0; i < info->numOutputs; ++i) {
    218       if (info->out[i].patch)
    219          continue;
    220       for (c = 0; c < 4; ++c) {
    221          if (!(info->out[i].mask & (1 << c)))
    222             continue;
    223          assert(info->out[i].slot[c] >= 0x40 / 4);
    224          a = info->out[i].slot[c] - 0x40 / 4;
    225          vp->hdr[13 + a / 32] |= 1 << (a % 32);
    226          if (info->out[i].oread)
    227             nvc0_vtgp_hdr_update_oread(vp, info->out[i].slot[c]);
    228       }
    229    }
    230 
    231    for (i = 0; i < info->numSysVals; ++i) {
    232       switch (info->sv[i].sn) {
    233       case TGSI_SEMANTIC_PRIMID:
    234          vp->hdr[5] |= 1 << 24;
    235          break;
    236       case TGSI_SEMANTIC_INSTANCEID:
    237          vp->hdr[10] |= 1 << 30;
    238          break;
    239       case TGSI_SEMANTIC_VERTEXID:
    240          vp->hdr[10] |= 1 << 31;
    241          break;
    242       case TGSI_SEMANTIC_TESSCOORD:
    243          /* We don't have the mask, nor the slots populated. While this could
    244           * be achieved, the vast majority of the time if either of the coords
    245           * are read, then both will be read.
    246           */
    247          nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4);
    248          nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4);
    249          break;
    250       default:
    251          break;
    252       }
    253    }
    254 
    255    vp->vp.clip_enable = (1 << info->io.clipDistances) - 1;
    256    vp->vp.cull_enable =
    257       ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
    258    for (i = 0; i < info->io.cullDistances; ++i)
    259       vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
    260 
    261    if (info->io.genUserClip < 0)
    262       vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */
    263 
    264    return 0;
    265 }
    266 
    267 static int
    268 nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
    269 {
    270    vp->hdr[0] = 0x20061 | (1 << 10);
    271    vp->hdr[4] = 0xff000;
    272 
    273    return nvc0_vtgp_gen_header(vp, info);
    274 }
    275 
    276 static void
    277 nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
    278 {
    279    if (info->prop.tp.outputPrim == PIPE_PRIM_MAX) {
    280       tp->tp.tess_mode = ~0;
    281       return;
    282    }
    283    switch (info->prop.tp.domain) {
    284    case PIPE_PRIM_LINES:
    285       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_ISOLINES;
    286       break;
    287    case PIPE_PRIM_TRIANGLES:
    288       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
    289       break;
    290    case PIPE_PRIM_QUADS:
    291       tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
    292       break;
    293    default:
    294       tp->tp.tess_mode = ~0;
    295       return;
    296    }
    297 
    298    /* It seems like lines want the "CW" bit to indicate they're connected, and
    299     * spit out errors in dmesg when the "CONNECTED" bit is set.
    300     */
    301    if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) {
    302       if (info->prop.tp.domain == PIPE_PRIM_LINES)
    303          tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
    304       else
    305          tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
    306    }
    307 
    308    /* Winding only matters for triangles/quads, not lines. */
    309    if (info->prop.tp.domain != PIPE_PRIM_LINES &&
    310        info->prop.tp.outputPrim != PIPE_PRIM_POINTS &&
    311        info->prop.tp.winding > 0)
    312       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
    313 
    314    switch (info->prop.tp.partitioning) {
    315    case PIPE_TESS_SPACING_EQUAL:
    316       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;
    317       break;
    318    case PIPE_TESS_SPACING_FRACTIONAL_ODD:
    319       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;
    320       break;
    321    case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
    322       tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;
    323       break;
    324    default:
    325       assert(!"invalid tessellator partitioning");
    326       break;
    327    }
    328 }
    329 
    330 static int
    331 nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
    332 {
    333    unsigned opcs = 6; /* output patch constants (at least the TessFactors) */
    334 
    335    tcp->tp.input_patch_size = info->prop.tp.inputPatchSize;
    336 
    337    if (info->numPatchConstants)
    338       opcs = 8 + info->numPatchConstants * 4;
    339 
    340    tcp->hdr[0] = 0x20061 | (2 << 10);
    341 
    342    tcp->hdr[1] = opcs << 24;
    343    tcp->hdr[2] = info->prop.tp.outputPatchSize << 24;
    344 
    345    tcp->hdr[4] = 0xff000; /* initial min/max parallel output read address */
    346 
    347    nvc0_vtgp_gen_header(tcp, info);
    348 
    349    if (info->target >= NVISA_GM107_CHIPSET) {
    350       /* On GM107+, the number of output patch components has moved in the TCP
    351        * header, but it seems like blob still also uses the old position.
    352        * Also, the high 8-bits are located inbetween the min/max parallel
    353        * field and has to be set after updating the outputs. */
    354       tcp->hdr[3] = (opcs & 0x0f) << 28;
    355       tcp->hdr[4] |= (opcs & 0xf0) << 16;
    356    }
    357 
    358    nvc0_tp_get_tess_mode(tcp, info);
    359 
    360    return 0;
    361 }
    362 
    363 static int
    364 nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
    365 {
    366    tep->tp.input_patch_size = ~0;
    367 
    368    tep->hdr[0] = 0x20061 | (3 << 10);
    369    tep->hdr[4] = 0xff000;
    370 
    371    nvc0_vtgp_gen_header(tep, info);
    372 
    373    nvc0_tp_get_tess_mode(tep, info);
    374 
    375    tep->hdr[18] |= 0x3 << 12; /* ? */
    376 
    377    return 0;
    378 }
    379 
    380 static int
    381 nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
    382 {
    383    gp->hdr[0] = 0x20061 | (4 << 10);
    384 
    385    gp->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;
    386 
    387    switch (info->prop.gp.outputPrim) {
    388    case PIPE_PRIM_POINTS:
    389       gp->hdr[3] = 0x01000000;
    390       gp->hdr[0] |= 0xf0000000;
    391       break;
    392    case PIPE_PRIM_LINE_STRIP:
    393       gp->hdr[3] = 0x06000000;
    394       gp->hdr[0] |= 0x10000000;
    395       break;
    396    case PIPE_PRIM_TRIANGLE_STRIP:
    397       gp->hdr[3] = 0x07000000;
    398       gp->hdr[0] |= 0x10000000;
    399       break;
    400    default:
    401       assert(0);
    402       break;
    403    }
    404 
    405    gp->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);
    406 
    407    return nvc0_vtgp_gen_header(gp, info);
    408 }
    409 
    410 #define NVC0_INTERP_FLAT          (1 << 0)
    411 #define NVC0_INTERP_PERSPECTIVE   (2 << 0)
    412 #define NVC0_INTERP_LINEAR        (3 << 0)
    413 #define NVC0_INTERP_CENTROID      (1 << 2)
    414 
    415 static uint8_t
    416 nvc0_hdr_interp_mode(const struct nv50_ir_varying *var)
    417 {
    418    if (var->linear)
    419       return NVC0_INTERP_LINEAR;
    420    if (var->flat)
    421       return NVC0_INTERP_FLAT;
    422    return NVC0_INTERP_PERSPECTIVE;
    423 }
    424 
    425 static int
    426 nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
    427 {
    428    unsigned i, c, a, m;
    429 
    430    /* just 00062 on Kepler */
    431    fp->hdr[0] = 0x20062 | (5 << 10);
    432    fp->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */
    433 
    434    if (info->prop.fp.usesDiscard)
    435       fp->hdr[0] |= 0x8000;
    436    if (info->prop.fp.numColourResults > 1)
    437       fp->hdr[0] |= 0x4000;
    438    if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
    439       fp->hdr[19] |= 0x1;
    440    if (info->prop.fp.writesDepth) {
    441       fp->hdr[19] |= 0x2;
    442       fp->flags[0] = 0x11; /* deactivate ZCULL */
    443    }
    444 
    445    for (i = 0; i < info->numInputs; ++i) {
    446       m = nvc0_hdr_interp_mode(&info->in[i]);
    447       if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
    448          fp->fp.colors |= 1 << info->in[i].si;
    449          if (info->in[i].sc)
    450             fp->fp.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
    451       }
    452       for (c = 0; c < 4; ++c) {
    453          if (!(info->in[i].mask & (1 << c)))
    454             continue;
    455          a = info->in[i].slot[c];
    456          if (info->in[i].slot[0] >= (0x060 / 4) &&
    457              info->in[i].slot[0] <= (0x07c / 4)) {
    458             fp->hdr[5] |= 1 << (24 + (a - 0x060 / 4));
    459          } else
    460          if (info->in[i].slot[0] >= (0x2c0 / 4) &&
    461              info->in[i].slot[0] <= (0x2fc / 4)) {
    462             fp->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
    463          } else {
    464             if (info->in[i].slot[c] < (0x040 / 4) ||
    465                 info->in[i].slot[c] > (0x380 / 4))
    466                continue;
    467             a *= 2;
    468             if (info->in[i].slot[0] >= (0x300 / 4))
    469                a -= 32;
    470             fp->hdr[4 + a / 32] |= m << (a % 32);
    471          }
    472       }
    473    }
    474 
    475    for (i = 0; i < info->numOutputs; ++i) {
    476       if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
    477          fp->hdr[18] |= 0xf << info->out[i].slot[0];
    478    }
    479 
    480    /* There are no "regular" attachments, but the shader still needs to be
    481     * executed. It seems like it wants to think that it has some color
    482     * outputs in order to actually run.
    483     */
    484    if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth)
    485       fp->hdr[18] |= 0xf;
    486 
    487    fp->fp.early_z = info->prop.fp.earlyFragTests;
    488    fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn;
    489    fp->fp.reads_framebuffer = info->prop.fp.readsFramebuffer;
    490 
    491    /* Mark position xy and layer as read */
    492    if (fp->fp.reads_framebuffer)
    493       fp->hdr[5] |= 0x32000000;
    494 
    495    return 0;
    496 }
    497 
    498 static struct nvc0_transform_feedback_state *
    499 nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info,
    500                               const struct pipe_stream_output_info *pso)
    501 {
    502    struct nvc0_transform_feedback_state *tfb;
    503    unsigned b, i, c;
    504 
    505    tfb = MALLOC_STRUCT(nvc0_transform_feedback_state);
    506    if (!tfb)
    507       return NULL;
    508    for (b = 0; b < 4; ++b) {
    509       tfb->stride[b] = pso->stride[b] * 4;
    510       tfb->varying_count[b] = 0;
    511    }
    512    memset(tfb->varying_index, 0xff, sizeof(tfb->varying_index)); /* = skip */
    513 
    514    for (i = 0; i < pso->num_outputs; ++i) {
    515       unsigned s = pso->output[i].start_component;
    516       unsigned p = pso->output[i].dst_offset;
    517       const unsigned r = pso->output[i].register_index;
    518       b = pso->output[i].output_buffer;
    519 
    520       if (r >= info->numOutputs)
    521          continue;
    522 
    523       for (c = 0; c < pso->output[i].num_components; ++c)
    524          tfb->varying_index[b][p++] = info->out[r].slot[s + c];
    525 
    526       tfb->varying_count[b] = MAX2(tfb->varying_count[b], p);
    527       tfb->stream[b] = pso->output[i].stream;
    528    }
    529    for (b = 0; b < 4; ++b) // zero unused indices (looks nicer)
    530       for (c = tfb->varying_count[b]; c & 3; ++c)
    531          tfb->varying_index[b][c] = 0;
    532 
    533    return tfb;
    534 }
    535 
    536 #ifdef DEBUG
    537 static void
    538 nvc0_program_dump(struct nvc0_program *prog)
    539 {
    540    unsigned pos;
    541 
    542    if (prog->type != PIPE_SHADER_COMPUTE) {
    543       for (pos = 0; pos < ARRAY_SIZE(prog->hdr); ++pos)
    544          debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",
    545                       pos * sizeof(prog->hdr[0]), prog->hdr[pos]);
    546    }
    547    debug_printf("shader binary code (0x%x bytes):", prog->code_size);
    548    for (pos = 0; pos < prog->code_size / 4; ++pos) {
    549       if ((pos % 8) == 0)
    550          debug_printf("\n");
    551       debug_printf("%08x ", prog->code[pos]);
    552    }
    553    debug_printf("\n");
    554 }
    555 #endif
    556 
    557 bool
    558 nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
    559                        struct pipe_debug_callback *debug)
    560 {
    561    struct nv50_ir_prog_info *info;
    562    int ret;
    563 
    564    info = CALLOC_STRUCT(nv50_ir_prog_info);
    565    if (!info)
    566       return false;
    567 
    568    info->type = prog->type;
    569    info->target = chipset;
    570    info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
    571    info->bin.source = (void *)prog->pipe.tokens;
    572 
    573 #ifdef DEBUG
    574    info->target = debug_get_num_option("NV50_PROG_CHIPSET", chipset);
    575    info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
    576    info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
    577 #else
    578    info->optLevel = 3;
    579 #endif
    580 
    581    info->io.genUserClip = prog->vp.num_ucps;
    582    info->io.auxCBSlot = 15;
    583    info->io.msInfoCBSlot = 15;
    584    info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
    585    info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
    586    info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
    587    info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
    588    info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0);
    589    if (info->target >= NVISA_GK104_CHIPSET) {
    590       info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
    591       info->io.fbtexBindBase = NVC0_CB_AUX_FB_TEX_INFO;
    592    }
    593 
    594    if (prog->type == PIPE_SHADER_COMPUTE) {
    595       if (info->target >= NVISA_GK104_CHIPSET) {
    596          info->io.auxCBSlot = 7;
    597          info->io.msInfoCBSlot = 7;
    598          info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
    599       }
    600       info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO(0);
    601    } else {
    602       info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
    603    }
    604 
    605    info->assignSlots = nvc0_program_assign_varying_slots;
    606 
    607    ret = nv50_ir_generate_code(info);
    608    if (ret) {
    609       NOUVEAU_ERR("shader translation failed: %i\n", ret);
    610       goto out;
    611    }
    612    if (prog->type != PIPE_SHADER_COMPUTE)
    613       FREE(info->bin.syms);
    614 
    615    prog->code = info->bin.code;
    616    prog->code_size = info->bin.codeSize;
    617    prog->relocs = info->bin.relocData;
    618    prog->fixups = info->bin.fixupData;
    619    prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
    620    prog->num_barriers = info->numBarriers;
    621 
    622    prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
    623    prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters;
    624 
    625    if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
    626       info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
    627    prog->vp.edgeflag = info->io.edgeFlagIn;
    628 
    629    switch (prog->type) {
    630    case PIPE_SHADER_VERTEX:
    631       ret = nvc0_vp_gen_header(prog, info);
    632       break;
    633    case PIPE_SHADER_TESS_CTRL:
    634       ret = nvc0_tcp_gen_header(prog, info);
    635       break;
    636    case PIPE_SHADER_TESS_EVAL:
    637       ret = nvc0_tep_gen_header(prog, info);
    638       break;
    639    case PIPE_SHADER_GEOMETRY:
    640       ret = nvc0_gp_gen_header(prog, info);
    641       break;
    642    case PIPE_SHADER_FRAGMENT:
    643       ret = nvc0_fp_gen_header(prog, info);
    644       break;
    645    case PIPE_SHADER_COMPUTE:
    646       prog->cp.syms = info->bin.syms;
    647       prog->cp.num_syms = info->bin.numSyms;
    648       break;
    649    default:
    650       ret = -1;
    651       NOUVEAU_ERR("unknown program type: %u\n", prog->type);
    652       break;
    653    }
    654    if (ret)
    655       goto out;
    656 
    657    if (info->bin.tlsSpace) {
    658       assert(info->bin.tlsSpace < (1 << 24));
    659       prog->hdr[0] |= 1 << 26;
    660       prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */
    661       prog->need_tls = true;
    662    }
    663    /* TODO: factor 2 only needed where joinat/precont is used,
    664     *       and we only have to count non-uniform branches
    665     */
    666    /*
    667    if ((info->maxCFDepth * 2) > 16) {
    668       prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;
    669       prog->need_tls = true;
    670    }
    671    */
    672    if (info->io.globalAccess)
    673       prog->hdr[0] |= 1 << 26;
    674    if (info->io.globalAccess & 0x2)
    675       prog->hdr[0] |= 1 << 16;
    676    if (info->io.fp64)
    677       prog->hdr[0] |= 1 << 27;
    678 
    679    if (prog->pipe.stream_output.num_outputs)
    680       prog->tfb = nvc0_program_create_tfb_state(info,
    681                                                 &prog->pipe.stream_output);
    682 
    683    pipe_debug_message(debug, SHADER_INFO,
    684                       "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
    685                       prog->type, info->bin.tlsSpace, prog->num_gprs,
    686                       info->bin.instructions, info->bin.codeSize);
    687 
    688 #ifdef DEBUG
    689    if (debug_get_option("NV50_PROG_CHIPSET", NULL) && info->dbgFlags)
    690       nvc0_program_dump(prog);
    691 #endif
    692 
    693 out:
    694    FREE(info);
    695    return !ret;
    696 }
    697 
    698 static inline int
    699 nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
    700 {
    701    struct nvc0_screen *screen = nvc0->screen;
    702    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
    703    int ret;
    704    uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
    705 
    706    /* On Fermi, SP_START_ID must be aligned to 0x40.
    707     * On Kepler, the first instruction must be aligned to 0x80 because
    708     * latency information is expected only at certain positions.
    709     */
    710    if (screen->base.class_3d >= NVE4_3D_CLASS)
    711       size = size + (is_cp ? 0x40 : 0x70);
    712    size = align(size, 0x40);
    713 
    714    ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
    715    if (ret)
    716       return ret;
    717    prog->code_base = prog->mem->start;
    718 
    719    if (!is_cp) {
    720       if (screen->base.class_3d >= NVE4_3D_CLASS) {
    721          switch (prog->mem->start & 0xff) {
    722          case 0x40: prog->code_base += 0x70; break;
    723          case 0x80: prog->code_base += 0x30; break;
    724          case 0xc0: prog->code_base += 0x70; break;
    725          default:
    726             prog->code_base += 0x30;
    727             assert((prog->mem->start & 0xff) == 0x00);
    728             break;
    729          }
    730       }
    731    } else {
    732       if (screen->base.class_3d >= NVE4_3D_CLASS) {
    733          if (prog->mem->start & 0x40)
    734             prog->code_base += 0x40;
    735          assert((prog->code_base & 0x7f) == 0x00);
    736       }
    737    }
    738 
    739    return 0;
    740 }
    741 
    742 static inline void
    743 nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
    744 {
    745    struct nvc0_screen *screen = nvc0->screen;
    746    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
    747    uint32_t code_pos = prog->code_base + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
    748 
    749    if (prog->relocs)
    750       nv50_ir_relocate_code(prog->relocs, prog->code, code_pos,
    751                             screen->lib_code->start, 0);
    752    if (prog->fixups) {
    753       nv50_ir_apply_fixups(prog->fixups, prog->code,
    754                            prog->fp.force_persample_interp,
    755                            prog->fp.flatshade,
    756                            0 /* alphatest */);
    757       for (int i = 0; i < 2; i++) {
    758          unsigned mask = prog->fp.color_interp[i] >> 4;
    759          unsigned interp = prog->fp.color_interp[i] & 3;
    760          if (!mask)
    761             continue;
    762          prog->hdr[14] &= ~(0xff << (8 * i));
    763          if (prog->fp.flatshade)
    764             interp = NVC0_INTERP_FLAT;
    765          for (int c = 0; c < 4; c++)
    766             if (mask & (1 << c))
    767                prog->hdr[14] |= interp << (2 * (4 * i + c));
    768       }
    769    }
    770 
    771    if (!is_cp)
    772       nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
    773                            NV_VRAM_DOMAIN(&screen->base),
    774                            NVC0_SHADER_HEADER_SIZE, prog->hdr);
    775 
    776    nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
    777                         NV_VRAM_DOMAIN(&screen->base), prog->code_size,
    778                         prog->code);
    779 }
    780 
    781 bool
    782 nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog)
    783 {
    784    struct nvc0_screen *screen = nvc0->screen;
    785    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
    786    int ret;
    787    uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
    788 
    789    ret = nvc0_program_alloc_code(nvc0, prog);
    790    if (ret) {
    791       struct nouveau_heap *heap = screen->text_heap;
    792       struct nvc0_program *progs[] = { /* Sorted accordingly to SP_START_ID */
    793          nvc0->compprog, nvc0->vertprog, nvc0->tctlprog,
    794          nvc0->tevlprog, nvc0->gmtyprog, nvc0->fragprog
    795       };
    796 
    797       /* Note that the code library, which is allocated before anything else,
    798        * does not have a priv pointer. We can stop once we hit it.
    799        */
    800       while (heap->next && heap->next->priv) {
    801          struct nvc0_program *evict = heap->next->priv;
    802          nouveau_heap_free(&evict->mem);
    803       }
    804       debug_printf("WARNING: out of code space, evicting all shaders.\n");
    805 
    806       /* Make sure to synchronize before deleting the code segment. */
    807       IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);
    808 
    809       if ((screen->text->size << 1) <= (1 << 23)) {
    810          ret = nvc0_screen_resize_text_area(screen, screen->text->size << 1);
    811          if (ret) {
    812             NOUVEAU_ERR("Error allocating TEXT area: %d\n", ret);
    813             return false;
    814          }
    815          nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEXT);
    816          BCTX_REFN_bo(nvc0->bufctx_3d, 3D_TEXT,
    817                       NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD,
    818                       screen->text);
    819          if (screen->compute) {
    820             nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEXT);
    821             BCTX_REFN_bo(nvc0->bufctx_cp, CP_TEXT,
    822                          NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD,
    823                          screen->text);
    824          }
    825 
    826          /* Re-upload the builtin function into the new code segment. */
    827          nvc0_program_library_upload(nvc0);
    828       }
    829 
    830       ret = nvc0_program_alloc_code(nvc0, prog);
    831       if (ret) {
    832          NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
    833          return false;
    834       }
    835 
    836       /* All currently bound shaders have to be reuploaded. */
    837       for (int i = 0; i < ARRAY_SIZE(progs); i++) {
    838          if (!progs[i] || progs[i] == prog)
    839             continue;
    840 
    841          ret = nvc0_program_alloc_code(nvc0, progs[i]);
    842          if (ret) {
    843             NOUVEAU_ERR("failed to re-upload a shader after code eviction.\n");
    844             return false;
    845          }
    846          nvc0_program_upload_code(nvc0, progs[i]);
    847 
    848          if (progs[i]->type == PIPE_SHADER_COMPUTE) {
    849             /* Caches have to be invalidated but the CP_START_ID will be
    850              * updated in the launch_grid functions. */
    851             BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1);
    852             PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE);
    853          } else {
    854             BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(SP_START_ID(i)), 1);
    855             PUSH_DATA (nvc0->base.pushbuf, progs[i]->code_base);
    856          }
    857       }
    858    }
    859 
    860    nvc0_program_upload_code(nvc0, prog);
    861 
    862 #ifdef DEBUG
    863    if (debug_get_bool_option("NV50_PROG_DEBUG", false))
    864       nvc0_program_dump(prog);
    865 #endif
    866 
    867    BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
    868    PUSH_DATA (nvc0->base.pushbuf, 0x1011);
    869 
    870    return true;
    871 }
    872 
    873 /* Upload code for builtin functions like integer division emulation. */
    874 void
    875 nvc0_program_library_upload(struct nvc0_context *nvc0)
    876 {
    877    struct nvc0_screen *screen = nvc0->screen;
    878    int ret;
    879    uint32_t size;
    880    const uint32_t *code;
    881 
    882    if (screen->lib_code)
    883       return;
    884 
    885    nv50_ir_get_target_library(screen->base.device->chipset, &code, &size);
    886    if (!size)
    887       return;
    888 
    889    ret = nouveau_heap_alloc(screen->text_heap, align(size, 0x100), NULL,
    890                             &screen->lib_code);
    891    if (ret)
    892       return;
    893 
    894    nvc0->base.push_data(&nvc0->base,
    895                         screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base),
    896                         size, code);
    897    /* no need for a memory barrier, will be emitted with first program */
    898 }
    899 
    900 void
    901 nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
    902 {
    903    const struct pipe_shader_state pipe = prog->pipe;
    904    const ubyte type = prog->type;
    905 
    906    if (prog->mem)
    907       nouveau_heap_free(&prog->mem);
    908    FREE(prog->code); /* may be 0 for hardcoded shaders */
    909    FREE(prog->relocs);
    910    FREE(prog->fixups);
    911    if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
    912       FREE(prog->cp.syms);
    913    if (prog->tfb) {
    914       if (nvc0->state.tfb == prog->tfb)
    915          nvc0->state.tfb = NULL;
    916       FREE(prog->tfb);
    917    }
    918 
    919    memset(prog, 0, sizeof(*prog));
    920 
    921    prog->pipe = pipe;
    922    prog->type = type;
    923 }
    924 
    925 uint32_t
    926 nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
    927 {
    928    const struct nv50_ir_prog_symbol *syms =
    929       (const struct nv50_ir_prog_symbol *)prog->cp.syms;
    930    unsigned base = 0;
    931    unsigned i;
    932    if (prog->type != PIPE_SHADER_COMPUTE)
    933       base = NVC0_SHADER_HEADER_SIZE;
    934    for (i = 0; i < prog->cp.num_syms; ++i)
    935       if (syms[i].label == label)
    936          return prog->code_base + base + syms[i].offset;
    937    return prog->code_base; /* no symbols or symbol not found */
    938 }
    939 
    940 void
    941 nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)
    942 {
    943    struct ureg_program *ureg;
    944 
    945    ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
    946    if (!ureg)
    947       return;
    948 
    949    ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);
    950    ureg_END(ureg);
    951 
    952    nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);
    953 }
    954