Home | History | Annotate | Download | only in shader
      1 /*
      2  * Mesa 3-D graphics library
      3  *
      4  * Copyright (C) 2012-2013 LunarG, Inc.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included
     14  * in all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     22  * DEALINGS IN THE SOFTWARE.
     23  *
     24  * Authors:
     25  *    Chia-I Wu <olv (at) lunarg.com>
     26  */
     27 
     28 #include "tgsi/tgsi_dump.h"
     29 #include "toy_compiler.h"
     30 #include "toy_tgsi.h"
     31 #include "toy_legalize.h"
     32 #include "toy_optimize.h"
     33 #include "toy_helpers.h"
     34 #include "ilo_shader_internal.h"
     35 
     36 /* XXX Below is proof-of-concept code.  Skip this file! */
     37 
     38 /*
     39  * TODO
     40  * - primitive id is in r0.1.  FS receives PID as a flat attribute.
     41  * - set VUE header m0.1 for layered rendering
     42  */
     43 struct gs_compile_context {
     44    struct ilo_shader *shader;
     45    const struct ilo_shader_variant *variant;
     46    const struct pipe_stream_output_info *so_info;
     47 
     48    struct toy_compiler tc;
     49    struct toy_tgsi tgsi;
     50    int output_map[PIPE_MAX_SHADER_OUTPUTS];
     51 
     52    bool write_so;
     53    bool write_vue;
     54 
     55    int in_vue_size;
     56    int in_vue_count;
     57 
     58    int out_vue_size;
     59    int out_vue_min_count;
     60 
     61    bool is_static;
     62 
     63    struct {
     64       struct toy_src header;
     65       struct toy_src svbi;
     66       struct toy_src vues[6];
     67    } payload;
     68 
     69    struct {
     70       struct toy_dst urb_write_header;
     71       bool prim_start;
     72       bool prim_end;
     73       int prim_type;
     74 
     75       struct toy_dst tmp;
     76 
     77       /* buffered tgsi_outs */
     78       struct toy_dst buffers[3];
     79       int buffer_needed, buffer_cur;
     80 
     81       struct toy_dst so_written;
     82       struct toy_dst so_index;
     83 
     84       struct toy_src tgsi_outs[PIPE_MAX_SHADER_OUTPUTS];
     85    } vars;
     86 
     87    struct {
     88       struct toy_dst total_vertices;
     89       struct toy_dst total_prims;
     90 
     91       struct toy_dst num_vertices;
     92       struct toy_dst num_vertices_in_prim;
     93    } dynamic_data;
     94 
     95    struct {
     96       int total_vertices;
     97       int total_prims;
     98       /* this limits the max vertice count to be 256 */
     99       uint32_t last_vertex[8];
    100 
    101       int num_vertices;
    102       int num_vertices_in_prim;
    103    } static_data;
    104 
    105    int first_free_grf;
    106    int last_free_grf;
    107    int first_free_mrf;
    108    int last_free_mrf;
    109 };
    110 
    111 static void
    112 gs_COPY8(struct toy_compiler *tc, struct toy_dst dst, struct toy_src src)
    113 {
    114    struct toy_inst *inst;
    115 
    116    inst = tc_MOV(tc, dst, src);
    117    inst->exec_size = GEN6_EXECSIZE_8;
    118    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    119 }
    120 
    121 static void
    122 gs_COPY4(struct toy_compiler *tc,
    123          struct toy_dst dst, int dst_ch,
    124          struct toy_src src, int src_ch)
    125 {
    126    struct toy_inst *inst;
    127 
    128    inst = tc_MOV(tc,
    129          tdst_offset(dst, 0, dst_ch),
    130          tsrc_offset(src, 0, src_ch));
    131    inst->exec_size = GEN6_EXECSIZE_4;
    132    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    133 }
    134 
    135 static void
    136 gs_COPY1(struct toy_compiler *tc,
    137          struct toy_dst dst, int dst_ch,
    138          struct toy_src src, int src_ch)
    139 {
    140    struct toy_inst *inst;
    141 
    142    inst = tc_MOV(tc,
    143          tdst_offset(dst, 0, dst_ch),
    144          tsrc_rect(tsrc_offset(src, 0, src_ch), TOY_RECT_010));
    145    inst->exec_size = GEN6_EXECSIZE_1;
    146    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    147 }
    148 
    149 static void
    150 gs_init_vars(struct gs_compile_context *gcc)
    151 {
    152    struct toy_compiler *tc = &gcc->tc;
    153    struct toy_dst dst;
    154 
    155    /* init URB_WRITE header */
    156    dst = gcc->vars.urb_write_header;
    157 
    158    gs_COPY8(tc, dst, gcc->payload.header);
    159 
    160    gcc->vars.prim_start = true;
    161    gcc->vars.prim_end = false;
    162    switch (gcc->out_vue_min_count) {
    163    case 1:
    164       gcc->vars.prim_type = GEN6_3DPRIM_POINTLIST;
    165       break;
    166    case 2:
    167       gcc->vars.prim_type = GEN6_3DPRIM_LINESTRIP;
    168       break;
    169    case 3:
    170       gcc->vars.prim_type = GEN6_3DPRIM_TRISTRIP;
    171       break;
    172    }
    173 
    174    if (gcc->write_so)
    175       tc_MOV(tc, gcc->vars.so_written, tsrc_imm_d(0));
    176 }
    177 
    178 static void
    179 gs_save_output(struct gs_compile_context *gcc, const struct toy_src *outs)
    180 {
    181    struct toy_compiler *tc = &gcc->tc;
    182    const struct toy_dst buf = gcc->vars.buffers[gcc->vars.buffer_cur];
    183    int i;
    184 
    185    for (i = 0; i < gcc->shader->out.count; i++)
    186       tc_MOV(tc, tdst_offset(buf, i, 0), outs[i]);
    187 
    188    /* advance the cursor */
    189    gcc->vars.buffer_cur++;
    190    gcc->vars.buffer_cur %= gcc->vars.buffer_needed;
    191 }
    192 
    193 static void
    194 gs_write_so(struct gs_compile_context *gcc,
    195             struct toy_dst dst,
    196             struct toy_src index, struct toy_src out,
    197             bool send_write_commit_message,
    198             int binding_table_index)
    199 {
    200    struct toy_compiler *tc = &gcc->tc;
    201    struct toy_dst mrf_header;
    202    struct toy_src desc;
    203 
    204    mrf_header = tdst_d(tdst(TOY_FILE_MRF, gcc->first_free_mrf, 0));
    205 
    206    /* m0.5: destination index */
    207    gs_COPY1(tc, mrf_header, 5, index, 0);
    208 
    209    /* m0.0 - m0.3: RGBA */
    210    gs_COPY4(tc, mrf_header, 0, tsrc_type(out, mrf_header.type), 0);
    211 
    212    desc = tsrc_imm_mdesc_data_port(tc, false,
    213          1, send_write_commit_message,
    214          true, send_write_commit_message,
    215          GEN6_MSG_DP_SVB_WRITE, 0,
    216          binding_table_index);
    217 
    218    tc_SEND(tc, dst, tsrc_from(mrf_header), desc,
    219          GEN6_SFID_DP_RC);
    220 }
    221 
    222 static void
    223 gs_write_vue(struct gs_compile_context *gcc,
    224              struct toy_dst dst, struct toy_src msg_header,
    225              const struct toy_src *outs, int num_outs,
    226              bool eot)
    227 {
    228    struct toy_compiler *tc = &gcc->tc;
    229    struct toy_dst mrf_header;
    230    struct toy_src desc;
    231    int sent = 0;
    232 
    233    mrf_header = tdst_d(tdst(TOY_FILE_MRF, gcc->first_free_mrf, 0));
    234    gs_COPY8(tc, mrf_header, msg_header);
    235 
    236    while (sent < num_outs) {
    237       int mrf = gcc->first_free_mrf + 1;
    238       const int mrf_avail = gcc->last_free_mrf - mrf + 1;
    239       int msg_len, num_entries, i;
    240       bool complete;
    241 
    242       num_entries = (num_outs - sent + 1) / 2;
    243       complete = true;
    244       if (num_entries > mrf_avail) {
    245          num_entries = mrf_avail;
    246          complete = false;
    247       }
    248 
    249       for (i = 0; i < num_entries; i++) {
    250          gs_COPY4(tc, tdst(TOY_FILE_MRF, mrf + i / 2, 0), 0,
    251                outs[sent + 2 * i], 0);
    252          if (sent + i * 2 + 1 < gcc->shader->out.count) {
    253             gs_COPY4(tc, tdst(TOY_FILE_MRF, mrf + i / 2, 0), 4,
    254                   outs[sent + 2 * i + 1], 0);
    255          }
    256          mrf++;
    257       }
    258 
    259       /* do not forget the header */
    260       msg_len = num_entries + 1;
    261 
    262       if (complete) {
    263          desc = tsrc_imm_mdesc_urb(tc,
    264                eot, msg_len, !eot, true, true, !eot,
    265                false, sent, 0);
    266       }
    267       else {
    268          desc = tsrc_imm_mdesc_urb(tc,
    269                false, msg_len, 0, false, true, false,
    270                false, sent, 0);
    271       }
    272 
    273       tc_add2(tc, TOY_OPCODE_URB_WRITE,
    274             (complete) ? dst : tdst_null(), tsrc_from(mrf_header), desc);
    275 
    276       sent += num_entries * 2;
    277    }
    278 }
    279 
    280 static void
    281 gs_ff_sync(struct gs_compile_context *gcc, struct toy_dst dst,
    282            struct toy_src num_prims)
    283 {
    284    struct toy_compiler *tc = &gcc->tc;
    285    struct toy_dst mrf_header =
    286       tdst_d(tdst(TOY_FILE_MRF, gcc->first_free_mrf, 0));
    287    struct toy_src desc;
    288    bool allocate;
    289 
    290    gs_COPY8(tc, mrf_header, gcc->payload.header);
    291 
    292    /* set NumSOVertsToWrite and NumSOPrimsNeeded */
    293    if (gcc->write_so) {
    294       if (num_prims.file == TOY_FILE_IMM) {
    295          const uint32_t v =
    296             (num_prims.val32 * gcc->in_vue_count) << 16 | num_prims.val32;
    297 
    298          gs_COPY1(tc, mrf_header, 0, tsrc_imm_d(v), 0);
    299       }
    300       else {
    301          struct toy_dst m0_0 = tdst_d(gcc->vars.tmp);
    302 
    303          tc_MUL(tc, m0_0, num_prims, tsrc_imm_d(gcc->in_vue_count << 16));
    304          tc_OR(tc, m0_0, tsrc_from(m0_0), num_prims);
    305 
    306          gs_COPY1(tc, mrf_header, 0, tsrc_from(m0_0), 0);
    307       }
    308    }
    309 
    310    /* set NumGSPrimsGenerated */
    311    if (gcc->write_vue)
    312       gs_COPY1(tc, mrf_header, 1, num_prims, 0);
    313 
    314    /*
    315     * From the Sandy Bridge PRM, volume 2 part 1, page 173:
    316     *
    317     *     "Programming Note: If the GS stage is enabled, software must always
    318     *      allocate at least one GS URB Entry. This is true even if the GS
    319     *      thread never needs to output vertices to the pipeline, e.g., when
    320     *      only performing stream output. This is an artifact of the need to
    321     *      pass the GS thread an initial destination URB handle."
    322     */
    323    allocate = true;
    324    desc = tsrc_imm_mdesc_urb(tc, false, 1, 1,
    325          false, false, allocate,
    326          false, 0, 1);
    327 
    328    tc_SEND(tc, dst, tsrc_from(mrf_header), desc, GEN6_SFID_URB);
    329 }
    330 
    331 static void
    332 gs_discard(struct gs_compile_context *gcc)
    333 {
    334    struct toy_compiler *tc = &gcc->tc;
    335    struct toy_dst mrf_header;
    336    struct toy_src desc;
    337 
    338    mrf_header = tdst_d(tdst(TOY_FILE_MRF, gcc->first_free_mrf, 0));
    339 
    340    gs_COPY8(tc, mrf_header, tsrc_from(gcc->vars.urb_write_header));
    341 
    342    desc = tsrc_imm_mdesc_urb(tc,
    343          true, 1, 0, true, false, false,
    344          false, 0, 0);
    345 
    346    tc_add2(tc, TOY_OPCODE_URB_WRITE,
    347          tdst_null(), tsrc_from(mrf_header), desc);
    348 }
    349 
    350 static void
    351 gs_lower_opcode_endprim(struct gs_compile_context *gcc, struct toy_inst *inst)
    352 {
    353    /* if has control flow, set PrimEnd on the last vertex and URB_WRITE */
    354 }
    355 
    356 static void
    357 gs_lower_opcode_emit_vue_dynamic(struct gs_compile_context *gcc)
    358 {
    359    /* TODO similar to the static version */
    360 
    361    /*
    362     * When SO is enabled and the inputs are lines or triangles, vertices are
    363     * always buffered.  we can defer the emission of the current vertex until
    364     * the next EMIT or ENDPRIM.  Or, we can emit two URB_WRITEs with the later
    365     * patching the former.
    366     */
    367 }
    368 
    369 static void
    370 gs_lower_opcode_emit_so_dynamic(struct gs_compile_context *gcc)
    371 {
    372    struct toy_compiler *tc = &gcc->tc;
    373 
    374    tc_IF(tc, tdst_null(),
    375          tsrc_from(gcc->dynamic_data.num_vertices_in_prim),
    376          tsrc_imm_d(gcc->out_vue_min_count),
    377          GEN6_COND_GE);
    378 
    379    {
    380       tc_ADD(tc, gcc->vars.tmp, tsrc_from(gcc->vars.so_index), tsrc_imm_d(0x03020100));
    381 
    382       /* TODO same as static version */
    383    }
    384 
    385    tc_ENDIF(tc);
    386 
    387    tc_ADD(tc, gcc->vars.so_index,
    388          tsrc_from(gcc->vars.so_index), tsrc_imm_d(gcc->out_vue_min_count));
    389 }
    390 
    391 static void
    392 gs_lower_opcode_emit_vue_static(struct gs_compile_context *gcc)
    393 {
    394    struct toy_compiler *tc = &gcc->tc;
    395    struct toy_inst *inst2;
    396    bool eot;
    397 
    398    eot = (gcc->static_data.num_vertices == gcc->static_data.total_vertices);
    399 
    400    gcc->vars.prim_end =
    401       ((gcc->static_data.last_vertex[(gcc->static_data.num_vertices - 1) / 32] &
    402         1 << ((gcc->static_data.num_vertices - 1) % 32)) != 0);
    403 
    404    if (eot && gcc->write_so) {
    405       inst2 = tc_OR(tc, tdst_offset(gcc->vars.urb_write_header, 0, 2),
    406             tsrc_from(gcc->vars.so_written),
    407             tsrc_imm_d(gcc->vars.prim_type << 2 |
    408                        gcc->vars.prim_start << 1 |
    409                        gcc->vars.prim_end));
    410       inst2->exec_size = GEN6_EXECSIZE_1;
    411       inst2->src[0] = tsrc_rect(inst2->src[0], TOY_RECT_010);
    412       inst2->src[1] = tsrc_rect(inst2->src[1], TOY_RECT_010);
    413    }
    414    else {
    415       gs_COPY1(tc, gcc->vars.urb_write_header, 2,
    416             tsrc_imm_d(gcc->vars.prim_type << 2 |
    417                        gcc->vars.prim_start << 1 |
    418                        gcc->vars.prim_end), 0);
    419    }
    420 
    421    gs_write_vue(gcc, tdst_d(gcc->vars.tmp),
    422          tsrc_from(gcc->vars.urb_write_header),
    423          gcc->vars.tgsi_outs,
    424          gcc->shader->out.count, eot);
    425 
    426    if (!eot) {
    427       gs_COPY1(tc, gcc->vars.urb_write_header, 0,
    428             tsrc_from(tdst_d(gcc->vars.tmp)), 0);
    429    }
    430 
    431    gcc->vars.prim_start = gcc->vars.prim_end;
    432    gcc->vars.prim_end = false;
    433 }
    434 
    435 static void
    436 gs_lower_opcode_emit_so_static(struct gs_compile_context *gcc)
    437 {
    438    struct toy_compiler *tc = &gcc->tc;
    439    struct toy_inst *inst;
    440    int i, j;
    441 
    442    if (gcc->static_data.num_vertices_in_prim < gcc->out_vue_min_count)
    443       return;
    444 
    445    inst = tc_MOV(tc, tdst_w(gcc->vars.tmp), tsrc_imm_v(0x03020100));
    446    inst->exec_size = GEN6_EXECSIZE_8;
    447    inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;
    448 
    449    tc_ADD(tc, tdst_d(gcc->vars.tmp), tsrc_from(tdst_d(gcc->vars.tmp)),
    450          tsrc_rect(tsrc_from(gcc->vars.so_index), TOY_RECT_010));
    451 
    452    tc_IF(tc, tdst_null(),
    453          tsrc_rect(tsrc_offset(tsrc_from(tdst_d(gcc->vars.tmp)), 0, gcc->out_vue_min_count - 1), TOY_RECT_010),
    454          tsrc_rect(tsrc_offset(gcc->payload.svbi, 0, 4), TOY_RECT_010),
    455          GEN6_COND_LE);
    456    {
    457       for (i = 0; i < gcc->out_vue_min_count; i++) {
    458          for (j = 0; j < gcc->so_info->num_outputs; j++) {
    459             const int idx = gcc->so_info->output[j].register_index;
    460             struct toy_src index, out;
    461             int binding_table_index;
    462             bool write_commit;
    463 
    464             index = tsrc_d(tsrc_offset(tsrc_from(gcc->vars.tmp), 0, i));
    465 
    466             if (i == gcc->out_vue_min_count - 1) {
    467                out = gcc->vars.tgsi_outs[idx];
    468             }
    469             else {
    470                /* gcc->vars.buffer_cur also points to the first vertex */
    471                const int buf =
    472                   (gcc->vars.buffer_cur + i) % gcc->vars.buffer_needed;
    473 
    474                out = tsrc_offset(tsrc_from(gcc->vars.buffers[buf]), idx, 0);
    475             }
    476 
    477             out = tsrc_offset(out, 0, gcc->so_info->output[j].start_component);
    478 
    479             /*
    480              * From the Sandy Bridge PRM, volume 4 part 2, page 19:
    481              *
    482              *     "The Kernel must do a write commit on the last write to DAP
    483              *      prior to a URB_WRITE with End of Thread."
    484              */
    485             write_commit =
    486                (gcc->static_data.num_vertices == gcc->static_data.total_vertices &&
    487                 i == gcc->out_vue_min_count - 1 &&
    488                 j == gcc->so_info->num_outputs - 1);
    489 
    490 
    491             binding_table_index = gcc->shader->bt.gen6_so_base + j;
    492 
    493             gs_write_so(gcc, gcc->vars.tmp, index,
    494                   out, write_commit, binding_table_index);
    495 
    496             /*
    497              * From the Sandy Bridge PRM, volume 4 part 1, page 168:
    498              *
    499              *     "The write commit does not modify the destination register, but
    500              *      merely clears the dependency associated with the destination
    501              *      register. Thus, a simple "mov" instruction using the register as a
    502              *      source is sufficient to wait for the write commit to occur."
    503              */
    504             if (write_commit)
    505                tc_MOV(tc, gcc->vars.tmp, tsrc_from(gcc->vars.tmp));
    506          }
    507       }
    508 
    509       /* SONumPrimsWritten occupies the higher word of m0.2 of URB_WRITE */
    510       tc_ADD(tc, gcc->vars.so_written,
    511             tsrc_from(gcc->vars.so_written), tsrc_imm_d(1 << 16));
    512       tc_ADD(tc, gcc->vars.so_index,
    513             tsrc_from(gcc->vars.so_index), tsrc_imm_d(gcc->out_vue_min_count));
    514    }
    515    tc_ENDIF(tc);
    516 }
    517 
    518 static void
    519 gs_lower_opcode_emit_static(struct gs_compile_context *gcc,
    520                             struct toy_inst *inst)
    521 {
    522    gcc->static_data.num_vertices++;
    523    gcc->static_data.num_vertices_in_prim++;
    524 
    525    if (gcc->write_so) {
    526       gs_lower_opcode_emit_so_static(gcc);
    527 
    528       if (gcc->out_vue_min_count > 1 &&
    529           gcc->static_data.num_vertices != gcc->static_data.total_vertices)
    530          gs_save_output(gcc, gcc->vars.tgsi_outs);
    531    }
    532 
    533    if (gcc->write_vue)
    534       gs_lower_opcode_emit_vue_static(gcc);
    535 }
    536 
    537 static void
    538 gs_lower_opcode_emit_dynamic(struct gs_compile_context *gcc,
    539                              struct toy_inst *inst)
    540 {
    541    struct toy_compiler *tc = &gcc->tc;
    542 
    543    tc_ADD(tc, gcc->dynamic_data.num_vertices,
    544          tsrc_from(gcc->dynamic_data.num_vertices), tsrc_imm_d(1));
    545    tc_ADD(tc, gcc->dynamic_data.num_vertices_in_prim,
    546          tsrc_from(gcc->dynamic_data.num_vertices_in_prim), tsrc_imm_d(1));
    547 
    548    if (gcc->write_so) {
    549       gs_lower_opcode_emit_so_dynamic(gcc);
    550 
    551       if (gcc->out_vue_min_count > 1)
    552          gs_save_output(gcc, gcc->vars.tgsi_outs);
    553    }
    554 
    555    if (gcc->write_vue)
    556       gs_lower_opcode_emit_vue_dynamic(gcc);
    557 }
    558 
    559 static void
    560 gs_lower_opcode_emit(struct gs_compile_context *gcc, struct toy_inst *inst)
    561 {
    562    if (gcc->is_static)
    563       gs_lower_opcode_emit_static(gcc, inst);
    564    else
    565       gs_lower_opcode_emit_dynamic(gcc, inst);
    566 }
    567 
    568 static void
    569 gs_lower_opcode_tgsi_in(struct gs_compile_context *gcc,
    570                         struct toy_dst dst, int dim, int idx)
    571 {
    572    struct toy_compiler *tc = &gcc->tc;
    573    struct toy_src attr;
    574    int slot, reg = -1, subreg;
    575 
    576    slot = toy_tgsi_find_input(&gcc->tgsi, idx);
    577    if (slot >= 0) {
    578       int i;
    579 
    580       for (i = 0; i < gcc->variant->u.gs.num_inputs; i++) {
    581          if (gcc->variant->u.gs.semantic_names[i] ==
    582                gcc->tgsi.inputs[slot].semantic_name &&
    583                gcc->variant->u.gs.semantic_indices[i] ==
    584                gcc->tgsi.inputs[slot].semantic_index) {
    585             reg = i / 2;
    586             subreg = (i % 2) * 4;
    587             break;
    588          }
    589       }
    590    }
    591 
    592    if (reg < 0) {
    593       tc_MOV(tc, dst, tsrc_imm_f(0.0f));
    594       return;
    595    }
    596 
    597    /* fix vertex ordering for GEN6_3DPRIM_TRISTRIP_REVERSE */
    598    if (gcc->in_vue_count == 3 && dim < 2) {
    599       struct toy_inst *inst;
    600 
    601       /* get PrimType */
    602       inst = tc_AND(tc, tdst_d(gcc->vars.tmp),
    603             tsrc_offset(gcc->payload.header, 0, 2), tsrc_imm_d(0x1f));
    604       inst->exec_size = GEN6_EXECSIZE_1;
    605       inst->src[0] = tsrc_rect(inst->src[0], TOY_RECT_010);
    606       inst->src[1] = tsrc_rect(inst->src[1], TOY_RECT_010);
    607 
    608       inst = tc_CMP(tc, tdst_null(), tsrc_from(tdst_d(gcc->vars.tmp)),
    609             tsrc_imm_d(GEN6_3DPRIM_TRISTRIP_REVERSE), GEN6_COND_NZ);
    610       inst->src[0] = tsrc_rect(inst->src[0], TOY_RECT_010);
    611 
    612       attr = tsrc_offset(gcc->payload.vues[dim], reg, subreg);
    613       inst = tc_MOV(tc, dst, attr);
    614       inst->pred_ctrl = GEN6_PREDCTRL_NORMAL;
    615 
    616       /* swap IN[0] and IN[1] for GEN6_3DPRIM_TRISTRIP_REVERSE */
    617       dim = !dim;
    618 
    619       attr = tsrc_offset(gcc->payload.vues[dim], reg, subreg);
    620       inst = tc_MOV(tc, dst, attr);
    621       inst->pred_ctrl = GEN6_PREDCTRL_NORMAL;
    622       inst->pred_inv = true;
    623    }
    624    else {
    625       attr = tsrc_offset(gcc->payload.vues[dim], reg, subreg);
    626       tc_MOV(tc, dst, attr);
    627    }
    628 
    629 
    630 }
    631 
    632 static void
    633 gs_lower_opcode_tgsi_imm(struct gs_compile_context *gcc,
    634                          struct toy_dst dst, int idx)
    635 {
    636    const uint32_t *imm;
    637    int ch;
    638 
    639    imm = toy_tgsi_get_imm(&gcc->tgsi, idx, NULL);
    640 
    641    for (ch = 0; ch < 4; ch++) {
    642       struct toy_inst *inst;
    643 
    644       /* raw moves */
    645       inst = tc_MOV(&gcc->tc,
    646             tdst_writemask(tdst_ud(dst), 1 << ch),
    647             tsrc_imm_ud(imm[ch]));
    648       inst->access_mode = GEN6_ALIGN_16;
    649    }
    650 }
    651 
    652 static void
    653 gs_lower_opcode_tgsi_direct(struct gs_compile_context *gcc,
    654                             struct toy_inst *inst)
    655 {
    656    struct toy_compiler *tc = &gcc->tc;
    657    int dim, idx;
    658 
    659    assert(inst->src[0].file == TOY_FILE_IMM);
    660    dim = inst->src[0].val32;
    661 
    662    assert(inst->src[1].file == TOY_FILE_IMM);
    663    idx = inst->src[1].val32;
    664 
    665    switch (inst->opcode) {
    666    case TOY_OPCODE_TGSI_IN:
    667       gs_lower_opcode_tgsi_in(gcc, inst->dst, dim, idx);
    668       /* fetch all dimensions */
    669       if (dim == 0) {
    670          int i;
    671 
    672          for (i = 1; i < gcc->in_vue_count; i++) {
    673             const int vrf = toy_tgsi_get_vrf(&gcc->tgsi, TGSI_FILE_INPUT, i, idx);
    674             struct toy_dst dst;
    675 
    676             if (vrf < 0)
    677                continue;
    678 
    679             dst = tdst(TOY_FILE_VRF, vrf, 0);
    680             gs_lower_opcode_tgsi_in(gcc, dst, i, idx);
    681          }
    682       }
    683       break;
    684    case TOY_OPCODE_TGSI_IMM:
    685       assert(!dim);
    686       gs_lower_opcode_tgsi_imm(gcc, inst->dst, idx);
    687       break;
    688    case TOY_OPCODE_TGSI_CONST:
    689    case TOY_OPCODE_TGSI_SV:
    690    default:
    691       tc_fail(tc, "unhandled TGSI fetch");
    692       break;
    693    }
    694 
    695    tc_discard_inst(tc, inst);
    696 }
    697 
    698 static void
    699 gs_lower_virtual_opcodes(struct gs_compile_context *gcc)
    700 {
    701    struct toy_compiler *tc = &gcc->tc;
    702    struct toy_inst *inst;
    703 
    704    tc_head(tc);
    705    while ((inst = tc_next(tc)) != NULL) {
    706       switch (inst->opcode) {
    707       case TOY_OPCODE_TGSI_IN:
    708       case TOY_OPCODE_TGSI_CONST:
    709       case TOY_OPCODE_TGSI_SV:
    710       case TOY_OPCODE_TGSI_IMM:
    711          gs_lower_opcode_tgsi_direct(gcc, inst);
    712          break;
    713       case TOY_OPCODE_TGSI_INDIRECT_FETCH:
    714       case TOY_OPCODE_TGSI_INDIRECT_STORE:
    715          /* TODO similar to VS */
    716          tc_fail(tc, "no indirection support");
    717          tc_discard_inst(tc, inst);
    718          break;
    719       case TOY_OPCODE_TGSI_TEX:
    720       case TOY_OPCODE_TGSI_TXB:
    721       case TOY_OPCODE_TGSI_TXD:
    722       case TOY_OPCODE_TGSI_TXL:
    723       case TOY_OPCODE_TGSI_TXP:
    724       case TOY_OPCODE_TGSI_TXF:
    725       case TOY_OPCODE_TGSI_TXQ:
    726       case TOY_OPCODE_TGSI_TXQ_LZ:
    727       case TOY_OPCODE_TGSI_TEX2:
    728       case TOY_OPCODE_TGSI_TXB2:
    729       case TOY_OPCODE_TGSI_TXL2:
    730       case TOY_OPCODE_TGSI_SAMPLE:
    731       case TOY_OPCODE_TGSI_SAMPLE_I:
    732       case TOY_OPCODE_TGSI_SAMPLE_I_MS:
    733       case TOY_OPCODE_TGSI_SAMPLE_B:
    734       case TOY_OPCODE_TGSI_SAMPLE_C:
    735       case TOY_OPCODE_TGSI_SAMPLE_C_LZ:
    736       case TOY_OPCODE_TGSI_SAMPLE_D:
    737       case TOY_OPCODE_TGSI_SAMPLE_L:
    738       case TOY_OPCODE_TGSI_GATHER4:
    739       case TOY_OPCODE_TGSI_SVIEWINFO:
    740       case TOY_OPCODE_TGSI_SAMPLE_POS:
    741       case TOY_OPCODE_TGSI_SAMPLE_INFO:
    742          /* TODO similar to VS */
    743          tc_fail(tc, "no sampling support");
    744          tc_discard_inst(tc, inst);
    745          break;
    746       case TOY_OPCODE_EMIT:
    747          gs_lower_opcode_emit(gcc, inst);
    748          tc_discard_inst(tc, inst);
    749          break;
    750       case TOY_OPCODE_ENDPRIM:
    751          gs_lower_opcode_endprim(gcc, inst);
    752          tc_discard_inst(tc, inst);
    753          break;
    754       default:
    755          break;
    756       }
    757    }
    758 
    759    tc_head(tc);
    760    while ((inst = tc_next(tc)) != NULL) {
    761       switch (inst->opcode) {
    762       case TOY_OPCODE_INV:
    763       case TOY_OPCODE_LOG:
    764       case TOY_OPCODE_EXP:
    765       case TOY_OPCODE_SQRT:
    766       case TOY_OPCODE_RSQ:
    767       case TOY_OPCODE_SIN:
    768       case TOY_OPCODE_COS:
    769       case TOY_OPCODE_FDIV:
    770       case TOY_OPCODE_POW:
    771       case TOY_OPCODE_INT_DIV_QUOTIENT:
    772       case TOY_OPCODE_INT_DIV_REMAINDER:
    773          toy_compiler_lower_math(tc, inst);
    774          break;
    775       case TOY_OPCODE_URB_WRITE:
    776          toy_compiler_lower_to_send(tc, inst, false, GEN6_SFID_URB);
    777          break;
    778       default:
    779          if (inst->opcode > 127)
    780             tc_fail(tc, "unhandled virtual opcode");
    781          break;
    782       }
    783    }
    784 }
    785 
    786 /**
    787  * Get the number of (tessellated) primitives generated by this shader.
    788  * Return false if that is unknown until runtime.
    789  */
    790 static void
    791 get_num_prims_static(struct gs_compile_context *gcc)
    792 {
    793    struct toy_compiler *tc = &gcc->tc;
    794    const struct toy_inst *inst;
    795    int num_vertices_in_prim = 0, if_depth = 0, do_depth = 0;
    796    bool is_static = true;
    797 
    798    tc_head(tc);
    799    while ((inst = tc_next_no_skip(tc)) != NULL) {
    800       switch (inst->opcode) {
    801       case GEN6_OPCODE_IF:
    802          if_depth++;
    803          break;
    804       case GEN6_OPCODE_ENDIF:
    805          if_depth--;
    806          break;
    807       case TOY_OPCODE_DO:
    808          do_depth++;
    809          break;
    810       case GEN6_OPCODE_WHILE:
    811          do_depth--;
    812          break;
    813       case TOY_OPCODE_EMIT:
    814          if (if_depth || do_depth) {
    815             is_static = false;
    816          }
    817          else {
    818             gcc->static_data.total_vertices++;
    819 
    820             num_vertices_in_prim++;
    821             if (num_vertices_in_prim >= gcc->out_vue_min_count)
    822                gcc->static_data.total_prims++;
    823          }
    824          break;
    825       case TOY_OPCODE_ENDPRIM:
    826          if (if_depth || do_depth) {
    827             is_static = false;
    828          }
    829          else {
    830             const int vertidx = gcc->static_data.total_vertices - 1;
    831             const int idx = vertidx / 32;
    832             const int subidx = vertidx % 32;
    833 
    834             gcc->static_data.last_vertex[idx] |= 1 << subidx;
    835             num_vertices_in_prim = 0;
    836          }
    837          break;
    838       default:
    839          break;
    840       }
    841 
    842       if (!is_static)
    843          break;
    844    }
    845 
    846    gcc->is_static = is_static;
    847 }
    848 
    849 /**
    850  * Compile the shader.
    851  */
    852 static bool
    853 gs_compile(struct gs_compile_context *gcc)
    854 {
    855    struct toy_compiler *tc = &gcc->tc;
    856    struct ilo_shader *sh = gcc->shader;
    857 
    858    get_num_prims_static(gcc);
    859 
    860    if (gcc->is_static) {
    861       tc_head(tc);
    862 
    863       gs_init_vars(gcc);
    864       gs_ff_sync(gcc, tdst_d(gcc->vars.tmp), tsrc_imm_d(gcc->static_data.total_prims));
    865       gs_COPY1(tc, gcc->vars.urb_write_header, 0, tsrc_from(tdst_d(gcc->vars.tmp)), 0);
    866       if (gcc->write_so)
    867          gs_COPY4(tc, gcc->vars.so_index, 0, tsrc_from(tdst_d(gcc->vars.tmp)), 1);
    868 
    869       tc_tail(tc);
    870    }
    871    else {
    872       tc_fail(tc, "no control flow support");
    873       return false;
    874    }
    875 
    876    if (!gcc->write_vue)
    877       gs_discard(gcc);
    878 
    879    gs_lower_virtual_opcodes(gcc);
    880    toy_compiler_legalize_for_ra(tc);
    881    toy_compiler_optimize(tc);
    882    toy_compiler_allocate_registers(tc,
    883          gcc->first_free_grf,
    884          gcc->last_free_grf,
    885          1);
    886    toy_compiler_legalize_for_asm(tc);
    887 
    888    if (tc->fail) {
    889       ilo_err("failed to legalize GS instructions: %s\n", tc->reason);
    890       return false;
    891    }
    892 
    893    if (ilo_debug & ILO_DEBUG_GS) {
    894       ilo_printf("legalized instructions:\n");
    895       toy_compiler_dump(tc);
    896       ilo_printf("\n");
    897    }
    898 
    899    sh->kernel = toy_compiler_assemble(tc, &sh->kernel_size);
    900    if (!sh->kernel)
    901       return false;
    902 
    903    if (ilo_debug & ILO_DEBUG_GS) {
    904       ilo_printf("disassembly:\n");
    905       toy_compiler_disassemble(tc->dev, sh->kernel, sh->kernel_size, false);
    906       ilo_printf("\n");
    907    }
    908 
    909    return true;
    910 }
    911 
    912 static bool
    913 gs_compile_passthrough(struct gs_compile_context *gcc)
    914 {
    915    struct toy_compiler *tc = &gcc->tc;
    916    struct ilo_shader *sh = gcc->shader;
    917 
    918    gcc->is_static = true;
    919    gcc->static_data.total_vertices = gcc->in_vue_count;
    920    gcc->static_data.total_prims = 1;
    921    gcc->static_data.last_vertex[0] = 1 << (gcc->in_vue_count - 1);
    922 
    923    gs_init_vars(gcc);
    924    gs_ff_sync(gcc, tdst_d(gcc->vars.tmp), tsrc_imm_d(gcc->static_data.total_prims));
    925    gs_COPY1(tc, gcc->vars.urb_write_header, 0, tsrc_from(tdst_d(gcc->vars.tmp)), 0);
    926    if (gcc->write_so)
    927       gs_COPY4(tc, gcc->vars.so_index, 0, tsrc_from(tdst_d(gcc->vars.tmp)), 1);
    928 
    929    {
    930       int vert, attr;
    931 
    932       for (vert = 0; vert < gcc->out_vue_min_count; vert++) {
    933          for (attr = 0; attr < gcc->shader->out.count; attr++) {
    934             tc_MOV(tc, tdst_from(gcc->vars.tgsi_outs[attr]),
    935                   tsrc_offset(gcc->payload.vues[vert], attr / 2, (attr % 2) * 4));
    936          }
    937 
    938          gs_lower_opcode_emit(gcc, NULL);
    939       }
    940 
    941       gs_lower_opcode_endprim(gcc, NULL);
    942    }
    943 
    944    if (!gcc->write_vue)
    945       gs_discard(gcc);
    946 
    947    gs_lower_virtual_opcodes(gcc);
    948 
    949    toy_compiler_legalize_for_ra(tc);
    950    toy_compiler_optimize(tc);
    951    toy_compiler_allocate_registers(tc,
    952          gcc->first_free_grf,
    953          gcc->last_free_grf,
    954          1);
    955 
    956    toy_compiler_legalize_for_asm(tc);
    957 
    958    if (tc->fail) {
    959       ilo_err("failed to translate GS TGSI tokens: %s\n", tc->reason);
    960       return false;
    961    }
    962 
    963    if (ilo_debug & ILO_DEBUG_GS) {
    964       int i;
    965 
    966       ilo_printf("VUE count %d, VUE size %d\n",
    967             gcc->in_vue_count, gcc->in_vue_size);
    968       ilo_printf("%srasterizer discard\n",
    969             (gcc->variant->u.gs.rasterizer_discard) ? "" : "no ");
    970 
    971       for (i = 0; i < gcc->so_info->num_outputs; i++) {
    972          ilo_printf("SO[%d] = OUT[%d]\n", i,
    973                gcc->so_info->output[i].register_index);
    974       }
    975 
    976       ilo_printf("legalized instructions:\n");
    977       toy_compiler_dump(tc);
    978       ilo_printf("\n");
    979    }
    980 
    981    sh->kernel = toy_compiler_assemble(tc, &sh->kernel_size);
    982    if (!sh->kernel) {
    983       ilo_err("failed to compile GS: %s\n", tc->reason);
    984       return false;
    985    }
    986 
    987    if (ilo_debug & ILO_DEBUG_GS) {
    988       ilo_printf("disassembly:\n");
    989       toy_compiler_disassemble(tc->dev, sh->kernel, sh->kernel_size, false);
    990       ilo_printf("\n");
    991    }
    992 
    993    return true;
    994 }
    995 
    996 /**
    997  * Translate the TGSI tokens.
    998  */
    999 static bool
   1000 gs_setup_tgsi(struct toy_compiler *tc, const struct tgsi_token *tokens,
   1001               struct toy_tgsi *tgsi)
   1002 {
   1003    if (ilo_debug & ILO_DEBUG_GS) {
   1004       ilo_printf("dumping geometry shader\n");
   1005       ilo_printf("\n");
   1006 
   1007       tgsi_dump(tokens, 0);
   1008       ilo_printf("\n");
   1009    }
   1010 
   1011    toy_compiler_translate_tgsi(tc, tokens, true, tgsi);
   1012    if (tc->fail)
   1013       return false;
   1014 
   1015    if (ilo_debug & ILO_DEBUG_GS) {
   1016       ilo_printf("TGSI translator:\n");
   1017       toy_tgsi_dump(tgsi);
   1018       ilo_printf("\n");
   1019       toy_compiler_dump(tc);
   1020       ilo_printf("\n");
   1021    }
   1022 
   1023    return true;
   1024 }
   1025 
   1026 /**
   1027  * Set up shader inputs for fixed-function units.
   1028  */
   1029 static void
   1030 gs_setup_shader_in(struct ilo_shader *sh,
   1031                    const struct ilo_shader_variant *variant)
   1032 {
   1033    int i;
   1034 
   1035    for (i = 0; i < variant->u.gs.num_inputs; i++) {
   1036       sh->in.semantic_names[i] = variant->u.gs.semantic_names[i];
   1037       sh->in.semantic_indices[i] = variant->u.gs.semantic_indices[i];
   1038       sh->in.interp[i] = TGSI_INTERPOLATE_CONSTANT;
   1039       sh->in.centroid[i] = false;
   1040    }
   1041 
   1042    sh->in.count = variant->u.gs.num_inputs;
   1043 
   1044    sh->in.has_pos = false;
   1045    sh->in.has_linear_interp = false;
   1046    sh->in.barycentric_interpolation_mode = 0;
   1047 }
   1048 
   1049 /**
   1050  * Set up shader outputs for fixed-function units.
   1051  *
   1052  * XXX share the code with VS
   1053  */
   1054 static void
   1055 gs_setup_shader_out(struct ilo_shader *sh, const struct toy_tgsi *tgsi,
   1056                     bool output_clipdist, int *output_map)
   1057 {
   1058    int psize_slot = -1, pos_slot = -1;
   1059    int clipdist_slot[2] = { -1, -1 };
   1060    int color_slot[4] = { -1, -1, -1, -1 };
   1061    int num_outs, i;
   1062 
   1063    /* find out the slots of outputs that need special care */
   1064    for (i = 0; i < tgsi->num_outputs; i++) {
   1065       switch (tgsi->outputs[i].semantic_name) {
   1066       case TGSI_SEMANTIC_PSIZE:
   1067          psize_slot = i;
   1068          break;
   1069       case TGSI_SEMANTIC_POSITION:
   1070          pos_slot = i;
   1071          break;
   1072       case TGSI_SEMANTIC_CLIPDIST:
   1073          if (tgsi->outputs[i].semantic_index)
   1074             clipdist_slot[1] = i;
   1075          else
   1076             clipdist_slot[0] = i;
   1077          break;
   1078       case TGSI_SEMANTIC_COLOR:
   1079          if (tgsi->outputs[i].semantic_index)
   1080             color_slot[2] = i;
   1081          else
   1082             color_slot[0] = i;
   1083          break;
   1084       case TGSI_SEMANTIC_BCOLOR:
   1085          if (tgsi->outputs[i].semantic_index)
   1086             color_slot[3] = i;
   1087          else
   1088             color_slot[1] = i;
   1089          break;
   1090       default:
   1091          break;
   1092       }
   1093    }
   1094 
   1095    /* the first two VUEs are always PSIZE and POSITION */
   1096    num_outs = 2;
   1097    output_map[0] = psize_slot;
   1098    output_map[1] = pos_slot;
   1099 
   1100    sh->out.register_indices[0] =
   1101       (psize_slot >= 0) ? tgsi->outputs[psize_slot].index : -1;
   1102    sh->out.semantic_names[0] = TGSI_SEMANTIC_PSIZE;
   1103    sh->out.semantic_indices[0] = 0;
   1104 
   1105    sh->out.register_indices[1] =
   1106       (pos_slot >= 0) ? tgsi->outputs[pos_slot].index : -1;
   1107    sh->out.semantic_names[1] = TGSI_SEMANTIC_POSITION;
   1108    sh->out.semantic_indices[1] = 0;
   1109 
   1110    sh->out.has_pos = true;
   1111 
   1112    /* followed by optional clip distances */
   1113    if (output_clipdist) {
   1114       sh->out.register_indices[num_outs] =
   1115          (clipdist_slot[0] >= 0) ? tgsi->outputs[clipdist_slot[0]].index : -1;
   1116       sh->out.semantic_names[num_outs] = TGSI_SEMANTIC_CLIPDIST;
   1117       sh->out.semantic_indices[num_outs] = 0;
   1118       output_map[num_outs++] = clipdist_slot[0];
   1119 
   1120       sh->out.register_indices[num_outs] =
   1121          (clipdist_slot[1] >= 0) ? tgsi->outputs[clipdist_slot[1]].index : -1;
   1122       sh->out.semantic_names[num_outs] = TGSI_SEMANTIC_CLIPDIST;
   1123       sh->out.semantic_indices[num_outs] = 1;
   1124       output_map[num_outs++] = clipdist_slot[1];
   1125    }
   1126 
   1127    /*
   1128     * make BCOLOR follow COLOR so that we can make use of
   1129     * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING in 3DSTATE_SF
   1130     */
   1131    for (i = 0; i < 4; i++) {
   1132       const int slot = color_slot[i];
   1133 
   1134       if (slot < 0)
   1135          continue;
   1136 
   1137       sh->out.register_indices[num_outs] = tgsi->outputs[slot].index;
   1138       sh->out.semantic_names[num_outs] = tgsi->outputs[slot].semantic_name;
   1139       sh->out.semantic_indices[num_outs] = tgsi->outputs[slot].semantic_index;
   1140 
   1141       output_map[num_outs++] = slot;
   1142    }
   1143 
   1144    /* add the rest of the outputs */
   1145    for (i = 0; i < tgsi->num_outputs; i++) {
   1146       switch (tgsi->outputs[i].semantic_name) {
   1147       case TGSI_SEMANTIC_PSIZE:
   1148       case TGSI_SEMANTIC_POSITION:
   1149       case TGSI_SEMANTIC_CLIPDIST:
   1150       case TGSI_SEMANTIC_COLOR:
   1151       case TGSI_SEMANTIC_BCOLOR:
   1152          break;
   1153       default:
   1154          sh->out.register_indices[num_outs] = tgsi->outputs[i].index;
   1155          sh->out.semantic_names[num_outs] = tgsi->outputs[i].semantic_name;
   1156          sh->out.semantic_indices[num_outs] = tgsi->outputs[i].semantic_index;
   1157          output_map[num_outs++] = i;
   1158          break;
   1159       }
   1160    }
   1161 
   1162    sh->out.count = num_outs;
   1163 }
   1164 
   1165 static void
   1166 gs_setup_vars(struct gs_compile_context *gcc)
   1167 {
   1168    int grf = gcc->first_free_grf;
   1169    int i;
   1170 
   1171    gcc->vars.urb_write_header = tdst_d(tdst(TOY_FILE_GRF, grf, 0));
   1172    grf++;
   1173 
   1174    gcc->vars.tmp = tdst(TOY_FILE_GRF, grf, 0);
   1175    grf++;
   1176 
   1177    if (gcc->write_so) {
   1178       gcc->vars.buffer_needed = gcc->out_vue_min_count - 1;
   1179       for (i = 0; i < gcc->vars.buffer_needed; i++) {
   1180          gcc->vars.buffers[i] = tdst(TOY_FILE_GRF, grf, 0);
   1181          grf += gcc->shader->out.count;
   1182       }
   1183 
   1184       gcc->vars.so_written = tdst_d(tdst(TOY_FILE_GRF, grf, 0));
   1185       grf++;
   1186 
   1187       gcc->vars.so_index = tdst_d(tdst(TOY_FILE_GRF, grf, 0));
   1188       grf++;
   1189    }
   1190 
   1191    gcc->first_free_grf = grf;
   1192 
   1193    if (!gcc->tgsi.reg_mapping) {
   1194       for (i = 0; i < gcc->shader->out.count; i++)
   1195          gcc->vars.tgsi_outs[i] = tsrc(TOY_FILE_GRF, grf++, 0);
   1196 
   1197       gcc->first_free_grf = grf;
   1198       return;
   1199    }
   1200 
   1201    for (i = 0; i < gcc->shader->out.count; i++) {
   1202       const int slot = gcc->output_map[i];
   1203       const int vrf = (slot >= 0) ? toy_tgsi_get_vrf(&gcc->tgsi,
   1204             TGSI_FILE_OUTPUT, 0, gcc->tgsi.outputs[slot].index) : -1;
   1205 
   1206       if (vrf >= 0)
   1207          gcc->vars.tgsi_outs[i] = tsrc(TOY_FILE_VRF, vrf, 0);
   1208       else
   1209          gcc->vars.tgsi_outs[i] = (i == 0) ? tsrc_imm_d(0) : tsrc_imm_f(0.0f);
   1210    }
   1211 }
   1212 
   1213 static void
   1214 gs_setup_payload(struct gs_compile_context *gcc)
   1215 {
   1216    int grf, i;
   1217 
   1218    grf = 0;
   1219 
   1220    /* r0: payload header */
   1221    gcc->payload.header = tsrc_d(tsrc(TOY_FILE_GRF, grf, 0));
   1222    grf++;
   1223 
   1224    /* r1: SVBI */
   1225    if (gcc->write_so) {
   1226       gcc->payload.svbi = tsrc_ud(tsrc(TOY_FILE_GRF, grf, 0));
   1227       grf++;
   1228    }
   1229 
   1230    /* URB data */
   1231    gcc->shader->in.start_grf = grf;
   1232 
   1233    /* no pull constants */
   1234 
   1235    /* VUEs */
   1236    for (i = 0; i < gcc->in_vue_count; i++) {
   1237       gcc->payload.vues[i] = tsrc(TOY_FILE_GRF, grf, 0);
   1238       grf += gcc->in_vue_size;
   1239    }
   1240 
   1241    gcc->first_free_grf = grf;
   1242    gcc->last_free_grf = 127;
   1243 }
   1244 
   1245 /**
   1246  * Set up GS compile context.  This includes translating the TGSI tokens.
   1247  */
   1248 static bool
   1249 gs_setup(struct gs_compile_context *gcc,
   1250          const struct ilo_shader_state *state,
   1251          const struct ilo_shader_variant *variant,
   1252          int num_verts)
   1253 {
   1254    memset(gcc, 0, sizeof(*gcc));
   1255 
   1256    gcc->shader = CALLOC_STRUCT(ilo_shader);
   1257    if (!gcc->shader)
   1258       return false;
   1259 
   1260    gcc->variant = variant;
   1261    gcc->so_info = &state->info.stream_output;
   1262 
   1263    toy_compiler_init(&gcc->tc, state->info.dev);
   1264 
   1265    gcc->write_so = (state->info.stream_output.num_outputs > 0);
   1266    gcc->write_vue = !gcc->variant->u.gs.rasterizer_discard;
   1267 
   1268    gcc->tc.templ.access_mode = GEN6_ALIGN_16;
   1269    gcc->tc.templ.exec_size = GEN6_EXECSIZE_4;
   1270    gcc->tc.rect_linear_width = 4;
   1271 
   1272    if (state->info.tokens) {
   1273       if (!gs_setup_tgsi(&gcc->tc, state->info.tokens, &gcc->tgsi)) {
   1274          toy_compiler_cleanup(&gcc->tc);
   1275          FREE(gcc->shader);
   1276          return false;
   1277       }
   1278 
   1279       switch (gcc->tgsi.props.gs_input_prim) {
   1280       case PIPE_PRIM_POINTS:
   1281          gcc->in_vue_count = 1;
   1282          break;
   1283       case PIPE_PRIM_LINES:
   1284          gcc->in_vue_count = 2;
   1285          gcc->shader->in.discard_adj = true;
   1286          break;
   1287       case PIPE_PRIM_TRIANGLES:
   1288          gcc->in_vue_count = 3;
   1289          gcc->shader->in.discard_adj = true;
   1290          break;
   1291       case PIPE_PRIM_LINES_ADJACENCY:
   1292          gcc->in_vue_count = 4;
   1293          break;
   1294       case PIPE_PRIM_TRIANGLES_ADJACENCY:
   1295          gcc->in_vue_count = 6;
   1296          break;
   1297       default:
   1298          tc_fail(&gcc->tc, "unsupported GS input type");
   1299          gcc->in_vue_count = 0;
   1300          break;
   1301       }
   1302 
   1303       switch (gcc->tgsi.props.gs_output_prim) {
   1304       case PIPE_PRIM_POINTS:
   1305          gcc->out_vue_min_count = 1;
   1306          break;
   1307       case PIPE_PRIM_LINE_STRIP:
   1308          gcc->out_vue_min_count = 2;
   1309          break;
   1310       case PIPE_PRIM_TRIANGLE_STRIP:
   1311          gcc->out_vue_min_count = 3;
   1312          break;
   1313       default:
   1314          tc_fail(&gcc->tc, "unsupported GS output type");
   1315          gcc->out_vue_min_count = 0;
   1316          break;
   1317       }
   1318    }
   1319    else {
   1320       int i;
   1321 
   1322       gcc->in_vue_count = num_verts;
   1323       gcc->out_vue_min_count = num_verts;
   1324 
   1325       gcc->tgsi.num_outputs = gcc->variant->u.gs.num_inputs;
   1326       for (i = 0; i < gcc->variant->u.gs.num_inputs; i++) {
   1327          gcc->tgsi.outputs[i].semantic_name =
   1328             gcc->variant->u.gs.semantic_names[i];
   1329          gcc->tgsi.outputs[i].semantic_index =
   1330             gcc->variant->u.gs.semantic_indices[i];
   1331       }
   1332    }
   1333 
   1334    gcc->tc.templ.access_mode = GEN6_ALIGN_1;
   1335 
   1336    gs_setup_shader_in(gcc->shader, gcc->variant);
   1337    gs_setup_shader_out(gcc->shader, &gcc->tgsi, false, gcc->output_map);
   1338 
   1339    gcc->in_vue_size = (gcc->shader->in.count + 1) / 2;
   1340 
   1341    gcc->out_vue_size = (gcc->shader->out.count + 1) / 2;
   1342 
   1343    gs_setup_payload(gcc);
   1344    gs_setup_vars(gcc);
   1345 
   1346    /* m0 is reserved for system routines */
   1347    gcc->first_free_mrf = 1;
   1348    gcc->last_free_mrf = 15;
   1349 
   1350    gcc->shader->bt.gen6_so_base = 0;
   1351    gcc->shader->bt.gen6_so_count = gcc->so_info->num_outputs;
   1352 
   1353    gcc->shader->bt.total_count = gcc->shader->bt.gen6_so_count;
   1354 
   1355    return true;
   1356 }
   1357 
   1358 /**
   1359  * Compile the geometry shader.
   1360  */
   1361 struct ilo_shader *
   1362 ilo_shader_compile_gs(const struct ilo_shader_state *state,
   1363                       const struct ilo_shader_variant *variant)
   1364 {
   1365    struct gs_compile_context gcc;
   1366 
   1367    if (!gs_setup(&gcc, state, variant, 0))
   1368       return NULL;
   1369 
   1370    if (!gs_compile(&gcc)) {
   1371       FREE(gcc.shader);
   1372       gcc.shader = NULL;
   1373    }
   1374 
   1375    toy_tgsi_cleanup(&gcc.tgsi);
   1376    toy_compiler_cleanup(&gcc.tc);
   1377 
   1378    return gcc.shader;
   1379 }
   1380 
   1381 static bool
   1382 append_gs_to_vs(struct ilo_shader *vs, struct ilo_shader *gs, int num_verts)
   1383 {
   1384    void *combined;
   1385    int gs_offset;
   1386 
   1387    if (!gs)
   1388       return false;
   1389 
   1390    /* kernels must be aligned to 64-byte */
   1391    gs_offset = align(vs->kernel_size, 64);
   1392    combined = REALLOC(vs->kernel, vs->kernel_size,
   1393          gs_offset + gs->kernel_size);
   1394    if (!combined)
   1395       return false;
   1396 
   1397    memcpy(combined + gs_offset, gs->kernel, gs->kernel_size);
   1398 
   1399    vs->kernel = combined;
   1400    vs->kernel_size = gs_offset + gs->kernel_size;
   1401 
   1402    vs->stream_output = true;
   1403    vs->gs_offsets[num_verts - 1] = gs_offset;
   1404    vs->gs_start_grf = gs->in.start_grf;
   1405    vs->gs_bt_so_count = gs->bt.gen6_so_count;
   1406 
   1407    ilo_shader_destroy_kernel(gs);
   1408 
   1409    return true;
   1410 }
   1411 
   1412 bool
   1413 ilo_shader_compile_gs_passthrough(const struct ilo_shader_state *vs_state,
   1414                                   const struct ilo_shader_variant *vs_variant,
   1415                                   const int *so_mapping,
   1416                                   struct ilo_shader *vs)
   1417 {
   1418    struct gs_compile_context gcc;
   1419    struct ilo_shader_state state;
   1420    struct ilo_shader_variant variant;
   1421    const int num_verts = 3;
   1422    int i;
   1423 
   1424    /* init GS state and variant */
   1425    state = *vs_state;
   1426    state.info.tokens = NULL;
   1427    for (i = 0; i < state.info.stream_output.num_outputs; i++) {
   1428       const int reg = state.info.stream_output.output[i].register_index;
   1429 
   1430       state.info.stream_output.output[i].register_index = so_mapping[reg];
   1431    }
   1432 
   1433    variant = *vs_variant;
   1434    variant.u.gs.rasterizer_discard = vs_variant->u.vs.rasterizer_discard;
   1435    variant.u.gs.num_inputs = vs->out.count;
   1436    for (i = 0; i < vs->out.count; i++) {
   1437       variant.u.gs.semantic_names[i] =
   1438          vs->out.semantic_names[i];
   1439       variant.u.gs.semantic_indices[i] =
   1440          vs->out.semantic_indices[i];
   1441    }
   1442 
   1443    if (!gs_setup(&gcc, &state, &variant, num_verts))
   1444       return false;
   1445 
   1446    if (!gs_compile_passthrough(&gcc)) {
   1447       FREE(gcc.shader);
   1448       gcc.shader = NULL;
   1449    }
   1450 
   1451    /* no need to call toy_tgsi_cleanup() */
   1452    toy_compiler_cleanup(&gcc.tc);
   1453 
   1454    return append_gs_to_vs(vs, gcc.shader, num_verts);
   1455 }
   1456