Home | History | Annotate | Download | only in i965
      1 /*
      2  * Copyright  2014 Intel Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21  * IN THE SOFTWARE.
     22  *
     23  * This code is based on original work by Ilia Mirkin.
     24  */
     25 
     26 /**
     27  * \file gen6_gs_visitor.cpp
     28  *
     29  * Gen6 geometry shader implementation
     30  */
     31 
     32 #include "gen6_gs_visitor.h"
     33 #include "brw_eu.h"
     34 
     35 namespace brw {
     36 
/**
 * Emit the gen6 geometry shader prolog.
 *
 * After running the common vec4_gs_visitor prolog, this allocates the
 * registers used to buffer shader outputs until thread end (vertex_output,
 * vertex_output_offset, temp, first_vertex and prim_count), copies r0 into
 * MRF 1 as the shared message header, sets up the transform feedback
 * registers when the program has transform feedback varyings, and places
 * the PrimitiveID in r1 when the program needs it.
 */
void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders require to allocate an initial VUE handle via
    * FF_SYNC message, however the documentation remarks that only one thread
    * can write to the URB simultaneously and the FF_SYNC message provides the
    * synchronization mechanism for this, so using this message effectively
    * stalls the thread until it is its turn to write to the URB. Because of
    * this, the best way to implement geometry shader algorithms in gen6 is to
    * execute the algorithm before the FF_SYNC message to maximize parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each emitted
    * vertex in vertex_output during operation. Then, when we have processed
    * the last vertex (that is, at thread end time), we send the FF_SYNC
    * message to allocate the initial VUE handle and write all buffered vertex
    * data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
    * which come right after the data items for that vertex. Vertex data and
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
   this->current_annotation = "gen6 prolog";
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 nir->info->gs.vertices_out);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
    * that we are processing the first vertex in the primitive and to zero
    * otherwise. This way we can use its value directly in the URB write
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));

   /* The FF_SYNC message requires to know the number of primitives generated,
    * so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));

   if (prog->info.has_transform_feedback_varyings) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold max values of SVBI.
       * The value is copied out of r1.4 right away because r1 may be
       * overwritten with the PrimitiveID below.
       */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
    * needs it we have to move it to a separate register where we can map
    * the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we need to
    * map all input attributes to hardware registers in setup_payload(),
    * which happens before virtual registers are mapped to hardware registers.
    * We could work around that issue if we were able to compute the first
    * non-payload register here and move the PrimitiveID information to that
    * register, but we can't because at this point we don't know the final
    * number of uniforms that will be included in the payload.
    *
    * So, what we do is to place PrimitiveID information in r1, which is always
    * delivered as part of the payload, but it is only populated with data
    * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
    * in the 3DSTATE_GS state packet. That information can be obtained by other
    * means though, so we can safely use r1 for this purpose.
    */
   if (gs_prog_data->include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}
    138 
    139 void
    140 gen6_gs_visitor::gs_emit_vertex(int stream_id)
    141 {
    142    this->current_annotation = "gen6 emit vertex";
    143 
    144    /* Buffer all output slots for this vertex in vertex_output */
    145    for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
    146       int varying = prog_data->vue_map.slot_to_varying[slot];
    147       if (varying != VARYING_SLOT_PSIZ) {
    148          dst_reg dst(this->vertex_output);
    149          dst.reladdr = ralloc(mem_ctx, src_reg);
    150          memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
    151          emit_urb_slot(dst, varying);
    152       } else {
    153          /* The PSIZ slot can pack multiple varyings in different channels
    154           * and emit_urb_slot() will produce a MOV instruction for each of
    155           * them. Since we are writing to an array, that will translate to
    156           * possibly multiple MOV instructions with an array destination and
    157           * each will generate a scratch write with the same offset into
    158           * scratch space (thus, each one overwriting the previous). This is
    159           * not what we want. What we will do instead is emit PSIZ to a
    160           * a regular temporary register, then move that resgister into the
    161           * array. This way we only have one instruction with an array
    162           * destination and we only produce a single scratch write.
    163           */
    164          dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
    165          emit_urb_slot(tmp, varying);
    166          dst_reg dst(this->vertex_output);
    167          dst.reladdr = ralloc(mem_ctx, src_reg);
    168          memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
    169          vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
    170          inst->force_writemask_all = true;
    171       }
    172 
    173       emit(ADD(dst_reg(this->vertex_output_offset),
    174                this->vertex_output_offset, brw_imm_ud(1u)));
    175    }
    176 
    177    /* Now buffer flags for this vertex */
    178    dst_reg dst(this->vertex_output);
    179    dst.reladdr = ralloc(mem_ctx, src_reg);
    180    memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
    181    if (nir->info->gs.output_primitive == GL_POINTS) {
    182       /* If we are outputting points, then every vertex has PrimStart and
    183        * PrimEnd set.
    184        */
    185       emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
    186                               URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
    187       emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
    188    } else {
    189       /* Otherwise, we can only set the PrimStart flag, which we have stored
    190        * in the first_vertex register. We will have to wait until we execute
    191        * EndPrimitive() or we end the thread to set the PrimEnd flag on a
    192        * vertex.
    193        */
    194       emit(OR(dst, this->first_vertex,
    195               brw_imm_ud(gs_prog_data->output_topology <<
    196                          URB_WRITE_PRIM_TYPE_SHIFT)));
    197       emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
    198    }
    199    emit(ADD(dst_reg(this->vertex_output_offset),
    200             this->vertex_output_offset, brw_imm_ud(1u)));
    201 }
    202 
    203 void
    204 gen6_gs_visitor::gs_end_primitive()
    205 {
    206    this->current_annotation = "gen6 end primitive";
    207    /* Calling EndPrimitive() is optional for point output. In this case we set
    208     * the PrimEnd flag when we process EmitVertex().
    209     */
    210    if (nir->info->gs.output_primitive == GL_POINTS)
    211       return;
    212 
    213    /* Otherwise we know that the last vertex we have processed was the last
    214     * vertex in the primitive and we need to set its PrimEnd flag, so do this
    215     * unless we haven't emitted that vertex at all (vertex_count != 0).
    216     *
    217     * Notice that we have already incremented vertex_count when we processed
    218     * the last emit_vertex, so we need to take that into account in the
    219     * comparison below (hence the num_output_vertices + 1 in the comparison
    220     * below).
    221     */
    222    unsigned num_output_vertices = nir->info->gs.vertices_out;
    223    emit(CMP(dst_null_ud(), this->vertex_count,
    224             brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
    225    vec4_instruction *inst = emit(CMP(dst_null_ud(),
    226                                      this->vertex_count, brw_imm_ud(0u),
    227                                      BRW_CONDITIONAL_NEQ));
    228    inst->predicate = BRW_PREDICATE_NORMAL;
    229    emit(IF(BRW_PREDICATE_NORMAL));
    230    {
    231       /* vertex_output_offset is already pointing at the first entry of the
    232        * next vertex. So subtract 1 to modify the flags for the previous
    233        * vertex.
    234        */
    235       src_reg offset(this, glsl_type::uint_type);
    236       emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
    237 
    238       src_reg dst(this->vertex_output);
    239       dst.reladdr = ralloc(mem_ctx, src_reg);
    240       memcpy(dst.reladdr, &offset, sizeof(src_reg));
    241 
    242       emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
    243       emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
    244 
    245       /* Set the first vertex flag to indicate that the next vertex will start
    246        * a primitive.
    247        */
    248       emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
    249    }
    250    emit(BRW_OPCODE_ENDIF);
    251 }
    252 
    253 void
    254 gen6_gs_visitor::emit_urb_write_header(int mrf)
    255 {
    256    this->current_annotation = "gen6 urb header";
    257    /* Compute offset of the flags for the current vertex in vertex_output and
    258     * write them in dw2 of the message header.
    259     *
    260     * Notice that by the time that emit_thread_end() calls here
    261     * vertex_output_offset should point to the first data item of the current
    262     * vertex in vertex_output, thus we only need to add the number of output
    263     * slots per vertex to that offset to obtain the flags data offset.
    264     */
    265    src_reg flags_offset(this, glsl_type::uint_type);
    266    emit(ADD(dst_reg(flags_offset),
    267             this->vertex_output_offset,
    268             brw_imm_d(prog_data->vue_map.num_slots)));
    269 
    270    src_reg flags_data(this->vertex_output);
    271    flags_data.reladdr = ralloc(mem_ctx, src_reg);
    272    memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
    273 
    274    emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
    275 }
    276 
    277 static int
    278 align_interleaved_urb_mlen(int mlen)
    279 {
    280    /* URB data written (does not include the message header reg) must
    281     * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
    282     * section 5.4.3.2.2: URB_INTERLEAVED.
    283     */
    284    if ((mlen % 2) != 1)
    285       mlen++;
    286    return mlen;
    287 }
    288 
    289 void
    290 gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
    291                                        int last_mrf, int urb_offset)
    292 {
    293    vec4_instruction *inst = NULL;
    294 
    295    if (!complete) {
    296       /* If the vertex is not complete we don't have to do anything special */
    297       inst = emit(GS_OPCODE_URB_WRITE);
    298       inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
    299    } else {
    300       /* Otherwise we always request to allocate a new VUE handle. If this is
    301        * the last write before the EOT message and the new handle never gets
    302        * used it will be dereferenced when we send the EOT message. This is
    303        * necessary to avoid different setups for the EOT message (one for the
    304        * case when there is no output and another for the case when there is)
    305        * which would require to end the program with an IF/ELSE/ENDIF block,
    306        * something we do not want.
    307        */
    308       inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
    309       inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
    310       inst->dst = dst_reg(MRF, base_mrf);
    311       inst->src[0] = this->temp;
    312    }
    313 
    314    inst->base_mrf = base_mrf;
    315    inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
    316    inst->offset = urb_offset;
    317 }
    318 
/**
 * Emit the code that runs when the geometry shader thread ends.
 *
 * This closes any primitive still open, and, when at least one vertex was
 * emitted, sends FF_SYNC and then loops over the vertices buffered in
 * vertex_output writing them to the URB (followed by the transform feedback
 * writes when the program has transform feedback varyings). Finally it emits
 * the EOT message.
 */
void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is not zero. This is only relevant for outputs other than
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (nir->info->gs.output_primitive != GL_POINTS) {
      emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      gs_end_primitive();
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 21..23
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (prog->info.has_transform_feedback_varyings) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
      emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         /* Leave the loop once every emitted vertex has been written. */
         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion.
          * A vertex with many slots is split over several URB writes; the
          * compile-time do/while below emits one write per iteration.
          */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is half
             * of one of those, since we're doing interleaved writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying][0].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, brw_imm_ud(1u)));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE. Same if we reached the max. message length.
                */
               if (mrf > max_usable_mrf ||
                   align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
                  /* The break skips the loop increment, so count this slot
                   * as consumed before leaving.
                   */
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, brw_imm_ud(1u)));

         emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
      }
      emit(BRW_OPCODE_WHILE);

      if (prog->info.has_transform_feedback_varyings)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we have
    * emitted at least one vertex or not. In case we did, the EOT message must
    * always include the COMPLETE flag or else the GPU hangs. If we have not
    * produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so what we do is that we always request a new
    * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
    * With this we make sure that whether we have emitted at least one vertex
    * or none at all, we have to finish the thread without writing to the URB,
    * which works for both cases by setting the COMPLETE and UNUSED flags in
    * the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (prog->info.has_transform_feedback_varyings) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value.
       *
       * NOTE(review): the only visible write to sol_prim_written is in
       * xfb_write(), which is emitted inside the vertex_count > 0 branch
       * above and returns early when there are no transform feedback
       * bindings. Confirm that the value read here is well defined on the
       * paths that skip that write.
       */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
      emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
    484 
    485 void
    486 gen6_gs_visitor::setup_payload()
    487 {
    488    int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
    489 
    490    /* Attributes are going to be interleaved, so one register contains two
    491     * attribute slots.
    492     */
    493    int attributes_per_reg = 2;
    494 
    495    /* If a geometry shader tries to read from an input that wasn't written by
    496     * the vertex shader, that produces undefined results, but it shouldn't
    497     * crash anything.  So initialize attribute_map to zeros--that ensures that
    498     * these undefined results are read from r0.
    499     */
    500    memset(attribute_map, 0, sizeof(attribute_map));
    501 
    502    int reg = 0;
    503 
    504    /* The payload always contains important data in r0. */
    505    reg++;
    506 
    507    /* r1 is always part of the payload and it holds information relevant
    508     * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
    509     * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
    510     * information (and move the original value to a virtual register if
    511     * necessary).
    512     */
    513    if (gs_prog_data->include_primitive_id)
    514       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
    515    reg++;
    516 
    517    reg = setup_uniforms(reg);
    518 
    519    reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
    520 
    521    lower_attributes_to_hw_regs(attribute_map, true);
    522 
    523    this->first_non_payload_grf = reg;
    524 }
    525 
    526 void
    527 gen6_gs_visitor::xfb_setup()
    528 {
    529    static const unsigned swizzle_for_offset[4] = {
    530       BRW_SWIZZLE4(0, 1, 2, 3),
    531       BRW_SWIZZLE4(1, 2, 3, 3),
    532       BRW_SWIZZLE4(2, 3, 3, 3),
    533       BRW_SWIZZLE4(3, 3, 3, 3)
    534    };
    535 
    536    const struct gl_transform_feedback_info *linked_xfb_info =
    537       this->prog->sh.LinkedTransformFeedback;
    538    int i;
    539 
    540    /* Make sure that the VUE slots won't overflow the unsigned chars in
    541     * prog_data->transform_feedback_bindings[].
    542     */
    543    STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
    544 
    545    /* Make sure that we don't need more binding table entries than we've
    546     * set aside for use in transform feedback.  (We shouldn't, since we
    547     * set aside enough binding table entries to have one per component).
    548     */
    549    assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
    550 
    551    gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
    552    for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
    553       gs_prog_data->transform_feedback_bindings[i] =
    554          linked_xfb_info->Outputs[i].OutputRegister;
    555       gs_prog_data->transform_feedback_swizzles[i] =
    556          swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
    557    }
    558 }
    559 
    560 void
    561 gen6_gs_visitor::xfb_write()
    562 {
    563    unsigned num_verts;
    564 
    565    if (!gs_prog_data->num_transform_feedback_bindings)
    566       return;
    567 
    568    switch (gs_prog_data->output_topology) {
    569    case _3DPRIM_POINTLIST:
    570       num_verts = 1;
    571       break;
    572    case _3DPRIM_LINELIST:
    573    case _3DPRIM_LINESTRIP:
    574    case _3DPRIM_LINELOOP:
    575       num_verts = 2;
    576       break;
    577    case _3DPRIM_TRILIST:
    578    case _3DPRIM_TRIFAN:
    579    case _3DPRIM_TRISTRIP:
    580    case _3DPRIM_RECTLIST:
    581       num_verts = 3;
    582       break;
    583    case _3DPRIM_QUADLIST:
    584    case _3DPRIM_QUADSTRIP:
    585    case _3DPRIM_POLYGON:
    586       num_verts = 3;
    587       break;
    588    default:
    589       unreachable("Unexpected primitive type in Gen6 SOL program.");
    590    }
    591 
    592    this->current_annotation = "gen6 thread end: svb writes init";
    593 
    594    emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
    595    emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
    596 
    597    /* Check that at least one primitive can be written
    598     *
    599     * Note: since we use the binding table to keep track of buffer offsets
    600     * and stride, the GS doesn't need to keep track of a separate pointer
    601     * into each buffer; it uses a single pointer which increments by 1 for
    602     * each vertex.  So we use SVBI0 for this pointer, regardless of whether
    603     * transform feedback is in interleaved or separate attribs mode.
    604     */
    605    src_reg sol_temp(this, glsl_type::uvec4_type);
    606    emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
    607 
    608    /* Compare SVBI calculated number with the maximum value, which is
    609     * in R1.4 (previously saved in this->max_svbi) for gen6.
    610     */
    611    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
    612    emit(IF(BRW_PREDICATE_NORMAL));
    613    {
    614       vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
    615                                         brw_imm_vf4(brw_float_to_vf(0.0),
    616                                                     brw_float_to_vf(1.0),
    617                                                     brw_float_to_vf(2.0),
    618                                                     brw_float_to_vf(0.0))));
    619       inst->force_writemask_all = true;
    620 
    621       emit(ADD(dst_reg(this->destination_indices),
    622                this->destination_indices,
    623                this->svbi));
    624    }
    625    emit(BRW_OPCODE_ENDIF);
    626 
    627    /* Write transform feedback data for all processed vertices. */
    628    for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) {
    629       emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
    630       emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
    631                BRW_CONDITIONAL_L));
    632       emit(IF(BRW_PREDICATE_NORMAL));
    633       {
    634          xfb_program(i, num_verts);
    635       }
    636       emit(BRW_OPCODE_ENDIF);
    637    }
    638 }
    639 
    640 void
    641 gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
    642 {
    643    unsigned binding;
    644    unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
    645    src_reg sol_temp(this, glsl_type::uvec4_type);
    646 
    647    /* Check for buffer overflow: we need room to write the complete primitive
    648     * (all vertices). Otherwise, avoid writing any vertices for it
    649     */
    650    emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
    651    emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
    652    emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
    653    emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
    654    emit(IF(BRW_PREDICATE_NORMAL));
    655    {
    656       /* Avoid overwriting MRF 1 as it is used as URB write message header */
    657       dst_reg mrf_reg(MRF, 2);
    658 
    659       this->current_annotation = "gen6: emit SOL vertex data";
    660       /* For each vertex, generate code to output each varying using the
    661        * appropriate binding table entry.
    662        */
    663       for (binding = 0; binding < num_bindings; ++binding) {
    664          unsigned char varying =
    665             gs_prog_data->transform_feedback_bindings[binding];
    666 
    667          /* Set up the correct destination index for this vertex */
    668          vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
    669                                        mrf_reg,
    670                                        this->destination_indices);
    671          inst->sol_vertex = vertex % num_verts;
    672 
    673          /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
    674           *
    675           *   "Prior to End of Thread with a URB_WRITE, the kernel must
    676           *   ensure that all writes are complete by sending the final
    677           *   write as a committed write."
    678           */
    679          bool final_write = binding == (unsigned) num_bindings - 1 &&
    680                             inst->sol_vertex == num_verts - 1;
    681 
    682          /* Compute offset of this varying for the current vertex
    683           * in vertex_output
    684           */
    685          this->current_annotation = output_reg_annotation[varying];
    686          src_reg data(this->vertex_output);
    687          data.reladdr = ralloc(mem_ctx, src_reg);
    688          int offset = get_vertex_output_offset_for_varying(vertex, varying);
    689          emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
    690          memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
    691          data.type = output_reg[varying][0].type;
    692 
    693          /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
    694           * same slot, so make sure we write the appropriate channel
    695           */
    696          if (varying == VARYING_SLOT_PSIZ)
    697             data.swizzle = BRW_SWIZZLE_WWWW;
    698          else if (varying == VARYING_SLOT_LAYER)
    699             data.swizzle = BRW_SWIZZLE_YYYY;
    700          else if (varying == VARYING_SLOT_VIEWPORT)
    701             data.swizzle = BRW_SWIZZLE_ZZZZ;
    702          else
    703             data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
    704 
    705          /* Write data */
    706          inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
    707          inst->sol_binding = binding;
    708          inst->sol_final_write = final_write;
    709 
    710          if (final_write) {
    711             /* This is the last vertex of the primitive, then increment
    712              * SO num primitive counter and destination indices.
    713              */
    714             emit(ADD(dst_reg(this->destination_indices),
    715                      this->destination_indices,
    716                      brw_imm_ud(num_verts)));
    717             emit(ADD(dst_reg(this->sol_prim_written),
    718                      this->sol_prim_written, brw_imm_ud(1u)));
    719          }
    720 
    721       }
    722       this->current_annotation = NULL;
    723    }
    724    emit(BRW_OPCODE_ENDIF);
    725 }
    726 
    727 int
    728 gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
    729 {
    730    /* Find the output slot assigned to this varying.
    731     *
    732     * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
    733     * as VARYING_SLOT_PSIZ.
    734     */
    735    if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
    736       varying = VARYING_SLOT_PSIZ;
    737    int slot = prog_data->vue_map.varying_to_slot[varying];
    738 
    739    if (slot < 0) {
    740       /* This varying does not exist in the VUE so we are not writing to it
    741        * and its value is undefined. We still want to return a valid offset
    742        * into vertex_output though, to prevent any out-of-bound accesses into
    743        * the vertex_output array. Since the value for this varying is undefined
    744        * we don't really care for the value we assign to it, so any offset
    745        * within the limits of vertex_output will do.
    746        */
    747       slot = 0;
    748    }
    749 
    750    return vertex * (prog_data->vue_map.num_slots + 1) + slot;
    751 }
    752 
    753 } /* namespace brw */
    754