1 /* 2 * Copyright 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * This code is based on original work by Ilia Mirkin. 24 */ 25 26 /** 27 * \file gen6_gs_visitor.cpp 28 * 29 * Gen6 geometry shader implementation 30 */ 31 32 #include "gen6_gs_visitor.h" 33 #include "brw_eu.h" 34 35 namespace brw { 36 37 void 38 gen6_gs_visitor::emit_prolog() 39 { 40 vec4_gs_visitor::emit_prolog(); 41 42 /* Gen6 geometry shaders require to allocate an initial VUE handle via 43 * FF_SYNC message, however the documentation remarks that only one thread 44 * can write to the URB simultaneously and the FF_SYNC message provides the 45 * synchronization mechanism for this, so using this message effectively 46 * stalls the thread until it is its turn to write to the URB. Because of 47 * this, the best way to implement geometry shader algorithms in gen6 is to 48 * execute the algorithm before the FF_SYNC message to maximize parallelism. 49 * 50 * To achieve this we buffer the geometry shader outputs for each emitted 51 * vertex in vertex_output during operation. Then, when we have processed 52 * the last vertex (that is, at thread end time), we send the FF_SYNC 53 * message to allocate the initial VUE handle and write all buffered vertex 54 * data to the URB in one go. 55 * 56 * For each emitted vertex, vertex_output will hold vue_map.num_slots 57 * data items plus one additional item to hold required flags 58 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message) 59 * which come right after the data items for that vertex. Vertex data and 60 * flags for the next vertex come right after the data items and flags for 61 * the previous vertex. 62 */ 63 this->current_annotation = "gen6 prolog"; 64 this->vertex_output = src_reg(this, 65 glsl_type::uint_type, 66 (prog_data->vue_map.num_slots + 1) * 67 nir->info->gs.vertices_out); 68 this->vertex_output_offset = src_reg(this, glsl_type::uint_type); 69 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); 70 71 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES), 72 * so initialize it once to R0. 73 */ 74 vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1), 75 retype(brw_vec8_grf(0, 0), 76 BRW_REGISTER_TYPE_UD))); 77 inst->force_writemask_all = true; 78 79 /* This will be used as a temporary to store writeback data of FF_SYNC 80 * and URB_WRITE messages. 81 */ 82 this->temp = src_reg(this, glsl_type::uint_type); 83 84 /* This will be used to know when we are processing the first vertex of 85 * a primitive. We will set this to URB_WRITE_PRIM_START only when we know 86 * that we are processing the first vertex in the primitive and to zero 87 * otherwise. This way we can use its value directly in the URB write 88 * headers. 89 */ 90 this->first_vertex = src_reg(this, glsl_type::uint_type); 91 emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START))); 92 93 /* The FF_SYNC message requires to know the number of primitives generated, 94 * so keep a counter for this. 95 */ 96 this->prim_count = src_reg(this, glsl_type::uint_type); 97 emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u))); 98 99 if (prog->info.has_transform_feedback_varyings) { 100 /* Create a virtual register to hold destination indices in SOL */ 101 this->destination_indices = src_reg(this, glsl_type::uvec4_type); 102 /* Create a virtual register to hold number of written primitives */ 103 this->sol_prim_written = src_reg(this, glsl_type::uint_type); 104 /* Create a virtual register to hold Streamed Vertex Buffer Indices */ 105 this->svbi = src_reg(this, glsl_type::uvec4_type); 106 /* Create a virtual register to hold max values of SVBI */ 107 this->max_svbi = src_reg(this, glsl_type::uvec4_type); 108 emit(MOV(dst_reg(this->max_svbi), 109 src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD)))); 110 111 xfb_setup(); 112 } 113 114 /* PrimitveID is delivered in r0.1 of the thread payload. If the program 115 * needs it we have to move it to a separate register where we can map 116 * the atttribute. 117 * 118 * Notice that we cannot use a virtual register for this, because we need to 119 * map all input attributes to hardware registers in setup_payload(), 120 * which happens before virtual registers are mapped to hardware registers. 121 * We could work around that issue if we were able to compute the first 122 * non-payload register here and move the PrimitiveID information to that 123 * register, but we can't because at this point we don't know the final 124 * number uniforms that will be included in the payload. 125 * 126 * So, what we do is to place PrimitiveID information in r1, which is always 127 * delivered as part of the payload, but its only populated with data 128 * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE 129 * in the 3DSTATE_GS state packet. That information can be obtained by other 130 * means though, so we can safely use r1 for this purpose. 131 */ 132 if (gs_prog_data->include_primitive_id) { 133 this->primitive_id = 134 src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 135 emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id)); 136 } 137 } 138 139 void 140 gen6_gs_visitor::gs_emit_vertex(int stream_id) 141 { 142 this->current_annotation = "gen6 emit vertex"; 143 144 /* Buffer all output slots for this vertex in vertex_output */ 145 for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) { 146 int varying = prog_data->vue_map.slot_to_varying[slot]; 147 if (varying != VARYING_SLOT_PSIZ) { 148 dst_reg dst(this->vertex_output); 149 dst.reladdr = ralloc(mem_ctx, src_reg); 150 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 151 emit_urb_slot(dst, varying); 152 } else { 153 /* The PSIZ slot can pack multiple varyings in different channels 154 * and emit_urb_slot() will produce a MOV instruction for each of 155 * them. Since we are writing to an array, that will translate to 156 * possibly multiple MOV instructions with an array destination and 157 * each will generate a scratch write with the same offset into 158 * scratch space (thus, each one overwriting the previous). This is 159 * not what we want. What we will do instead is emit PSIZ to a 160 * a regular temporary register, then move that resgister into the 161 * array. This way we only have one instruction with an array 162 * destination and we only produce a single scratch write. 163 */ 164 dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type)); 165 emit_urb_slot(tmp, varying); 166 dst_reg dst(this->vertex_output); 167 dst.reladdr = ralloc(mem_ctx, src_reg); 168 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 169 vec4_instruction *inst = emit(MOV(dst, src_reg(tmp))); 170 inst->force_writemask_all = true; 171 } 172 173 emit(ADD(dst_reg(this->vertex_output_offset), 174 this->vertex_output_offset, brw_imm_ud(1u))); 175 } 176 177 /* Now buffer flags for this vertex */ 178 dst_reg dst(this->vertex_output); 179 dst.reladdr = ralloc(mem_ctx, src_reg); 180 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 181 if (nir->info->gs.output_primitive == GL_POINTS) { 182 /* If we are outputting points, then every vertex has PrimStart and 183 * PrimEnd set. 184 */ 185 emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) | 186 URB_WRITE_PRIM_START | URB_WRITE_PRIM_END))); 187 emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); 188 } else { 189 /* Otherwise, we can only set the PrimStart flag, which we have stored 190 * in the first_vertex register. We will have to wait until we execute 191 * EndPrimitive() or we end the thread to set the PrimEnd flag on a 192 * vertex. 193 */ 194 emit(OR(dst, this->first_vertex, 195 brw_imm_ud(gs_prog_data->output_topology << 196 URB_WRITE_PRIM_TYPE_SHIFT))); 197 emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u))); 198 } 199 emit(ADD(dst_reg(this->vertex_output_offset), 200 this->vertex_output_offset, brw_imm_ud(1u))); 201 } 202 203 void 204 gen6_gs_visitor::gs_end_primitive() 205 { 206 this->current_annotation = "gen6 end primitive"; 207 /* Calling EndPrimitive() is optional for point output. In this case we set 208 * the PrimEnd flag when we process EmitVertex(). 209 */ 210 if (nir->info->gs.output_primitive == GL_POINTS) 211 return; 212 213 /* Otherwise we know that the last vertex we have processed was the last 214 * vertex in the primitive and we need to set its PrimEnd flag, so do this 215 * unless we haven't emitted that vertex at all (vertex_count != 0). 216 * 217 * Notice that we have already incremented vertex_count when we processed 218 * the last emit_vertex, so we need to take that into account in the 219 * comparison below (hence the num_output_vertices + 1 in the comparison 220 * below). 221 */ 222 unsigned num_output_vertices = nir->info->gs.vertices_out; 223 emit(CMP(dst_null_ud(), this->vertex_count, 224 brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L)); 225 vec4_instruction *inst = emit(CMP(dst_null_ud(), 226 this->vertex_count, brw_imm_ud(0u), 227 BRW_CONDITIONAL_NEQ)); 228 inst->predicate = BRW_PREDICATE_NORMAL; 229 emit(IF(BRW_PREDICATE_NORMAL)); 230 { 231 /* vertex_output_offset is already pointing at the first entry of the 232 * next vertex. So subtract 1 to modify the flags for the previous 233 * vertex. 234 */ 235 src_reg offset(this, glsl_type::uint_type); 236 emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1))); 237 238 src_reg dst(this->vertex_output); 239 dst.reladdr = ralloc(mem_ctx, src_reg); 240 memcpy(dst.reladdr, &offset, sizeof(src_reg)); 241 242 emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END))); 243 emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); 244 245 /* Set the first vertex flag to indicate that the next vertex will start 246 * a primitive. 247 */ 248 emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START))); 249 } 250 emit(BRW_OPCODE_ENDIF); 251 } 252 253 void 254 gen6_gs_visitor::emit_urb_write_header(int mrf) 255 { 256 this->current_annotation = "gen6 urb header"; 257 /* Compute offset of the flags for the current vertex in vertex_output and 258 * write them in dw2 of the message header. 259 * 260 * Notice that by the time that emit_thread_end() calls here 261 * vertex_output_offset should point to the first data item of the current 262 * vertex in vertex_output, thus we only need to add the number of output 263 * slots per vertex to that offset to obtain the flags data offset. 264 */ 265 src_reg flags_offset(this, glsl_type::uint_type); 266 emit(ADD(dst_reg(flags_offset), 267 this->vertex_output_offset, 268 brw_imm_d(prog_data->vue_map.num_slots))); 269 270 src_reg flags_data(this->vertex_output); 271 flags_data.reladdr = ralloc(mem_ctx, src_reg); 272 memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg)); 273 274 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data); 275 } 276 277 static int 278 align_interleaved_urb_mlen(int mlen) 279 { 280 /* URB data written (does not include the message header reg) must 281 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, 282 * section 5.4.3.2.2: URB_INTERLEAVED. 283 */ 284 if ((mlen % 2) != 1) 285 mlen++; 286 return mlen; 287 } 288 289 void 290 gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf, 291 int last_mrf, int urb_offset) 292 { 293 vec4_instruction *inst = NULL; 294 295 if (!complete) { 296 /* If the vertex is not complete we don't have to do anything special */ 297 inst = emit(GS_OPCODE_URB_WRITE); 298 inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; 299 } else { 300 /* Otherwise we always request to allocate a new VUE handle. If this is 301 * the last write before the EOT message and the new handle never gets 302 * used it will be dereferenced when we send the EOT message. This is 303 * necessary to avoid different setups for the EOT message (one for the 304 * case when there is no output and another for the case when there is) 305 * which would require to end the program with an IF/ELSE/ENDIF block, 306 * something we do not want. 307 */ 308 inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE); 309 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE; 310 inst->dst = dst_reg(MRF, base_mrf); 311 inst->src[0] = this->temp; 312 } 313 314 inst->base_mrf = base_mrf; 315 inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf); 316 inst->offset = urb_offset; 317 } 318 319 void 320 gen6_gs_visitor::emit_thread_end() 321 { 322 /* Make sure the current primitive is ended: we know it is not ended when 323 * first_vertex is not zero. This is only relevant for outputs other than 324 * points because in the point case we set PrimEnd on all vertices. 325 */ 326 if (nir->info->gs.output_primitive != GL_POINTS) { 327 emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z)); 328 emit(IF(BRW_PREDICATE_NORMAL)); 329 gs_end_primitive(); 330 emit(BRW_OPCODE_ENDIF); 331 } 332 333 /* Here we have to: 334 * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle. 335 * 2) Loop over all buffered vertex data and write it to corresponding 336 * URB entries. 337 * 3) Allocate new VUE handles for all vertices other than the first. 338 * 4) Send a final EOT message. 339 */ 340 341 /* MRF 0 is reserved for the debugger, so start with message header 342 * in MRF 1. 343 */ 344 int base_mrf = 1; 345 346 /* In the process of generating our URB write message contents, we 347 * may need to unspill a register or load from an array. Those 348 * reads would use MRFs 21..23 349 */ 350 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen); 351 352 /* Issue the FF_SYNC message and obtain the initial VUE handle. */ 353 emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G)); 354 emit(IF(BRW_PREDICATE_NORMAL)); 355 { 356 this->current_annotation = "gen6 thread end: ff_sync"; 357 358 vec4_instruction *inst; 359 if (prog->info.has_transform_feedback_varyings) { 360 src_reg sol_temp(this, glsl_type::uvec4_type); 361 emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES, 362 dst_reg(this->svbi), 363 this->vertex_count, 364 this->prim_count, 365 sol_temp); 366 inst = emit(GS_OPCODE_FF_SYNC, 367 dst_reg(this->temp), this->prim_count, this->svbi); 368 } else { 369 inst = emit(GS_OPCODE_FF_SYNC, 370 dst_reg(this->temp), this->prim_count, brw_imm_ud(0u)); 371 } 372 inst->base_mrf = base_mrf; 373 374 /* Loop over all buffered vertices and emit URB write messages */ 375 this->current_annotation = "gen6 thread end: urb writes init"; 376 src_reg vertex(this, glsl_type::uint_type); 377 emit(MOV(dst_reg(vertex), brw_imm_ud(0u))); 378 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); 379 380 this->current_annotation = "gen6 thread end: urb writes"; 381 emit(BRW_OPCODE_DO); 382 { 383 emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE)); 384 inst = emit(BRW_OPCODE_BREAK); 385 inst->predicate = BRW_PREDICATE_NORMAL; 386 387 /* First we prepare the message header */ 388 emit_urb_write_header(base_mrf); 389 390 /* Then add vertex data to the message in interleaved fashion */ 391 int slot = 0; 392 bool complete = false; 393 do { 394 int mrf = base_mrf + 1; 395 396 /* URB offset is in URB row increments, and each of our MRFs is half 397 * of one of those, since we're doing interleaved writes. 398 */ 399 int urb_offset = slot / 2; 400 401 for (; slot < prog_data->vue_map.num_slots; ++slot) { 402 int varying = prog_data->vue_map.slot_to_varying[slot]; 403 current_annotation = output_reg_annotation[varying]; 404 405 /* Compute offset of this slot for the current vertex 406 * in vertex_output 407 */ 408 src_reg data(this->vertex_output); 409 data.reladdr = ralloc(mem_ctx, src_reg); 410 memcpy(data.reladdr, &this->vertex_output_offset, 411 sizeof(src_reg)); 412 413 /* Copy this slot to the appropriate message register */ 414 dst_reg reg = dst_reg(MRF, mrf); 415 reg.type = output_reg[varying][0].type; 416 data.type = reg.type; 417 vec4_instruction *inst = emit(MOV(reg, data)); 418 inst->force_writemask_all = true; 419 420 mrf++; 421 emit(ADD(dst_reg(this->vertex_output_offset), 422 this->vertex_output_offset, brw_imm_ud(1u))); 423 424 /* If this was max_usable_mrf, we can't fit anything more into 425 * this URB WRITE. Same if we reached the max. message length. 426 */ 427 if (mrf > max_usable_mrf || 428 align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { 429 slot++; 430 break; 431 } 432 } 433 434 complete = slot >= prog_data->vue_map.num_slots; 435 emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset); 436 } while (!complete); 437 438 /* Skip over the flags data item so that vertex_output_offset points 439 * to the first data item of the next vertex, so that we can start 440 * writing the next vertex. 441 */ 442 emit(ADD(dst_reg(this->vertex_output_offset), 443 this->vertex_output_offset, brw_imm_ud(1u))); 444 445 emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u))); 446 } 447 emit(BRW_OPCODE_WHILE); 448 449 if (prog->info.has_transform_feedback_varyings) 450 xfb_write(); 451 } 452 emit(BRW_OPCODE_ENDIF); 453 454 /* Finally, emit EOT message. 455 * 456 * In gen6 we need to end the thread differently depending on whether we have 457 * emitted at least one vertex or not. In case we did, the EOT message must 458 * always include the COMPLETE flag or else the GPU hangs. If we have not 459 * produced any output we can't use the COMPLETE flag. 460 * 461 * However, this would lead us to end the program with an ENDIF opcode, 462 * which we want to avoid, so what we do is that we always request a new 463 * VUE handle every time we do a URB WRITE, even for the last vertex we emit. 464 * With this we make sure that whether we have emitted at least one vertex 465 * or none at all, we have to finish the thread without writing to the URB, 466 * which works for both cases by setting the COMPLETE and UNUSED flags in 467 * the EOT message. 468 */ 469 this->current_annotation = "gen6 thread end: EOT"; 470 471 if (prog->info.has_transform_feedback_varyings) { 472 /* When emitting EOT, set SONumPrimsWritten Increment Value. */ 473 src_reg data(this, glsl_type::uint_type); 474 emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu))); 475 emit(SHL(dst_reg(data), data, brw_imm_ud(16u))); 476 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data); 477 } 478 479 vec4_instruction *inst = emit(GS_OPCODE_THREAD_END); 480 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED; 481 inst->base_mrf = base_mrf; 482 inst->mlen = 1; 483 } 484 485 void 486 gen6_gs_visitor::setup_payload() 487 { 488 int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES]; 489 490 /* Attributes are going to be interleaved, so one register contains two 491 * attribute slots. 492 */ 493 int attributes_per_reg = 2; 494 495 /* If a geometry shader tries to read from an input that wasn't written by 496 * the vertex shader, that produces undefined results, but it shouldn't 497 * crash anything. So initialize attribute_map to zeros--that ensures that 498 * these undefined results are read from r0. 499 */ 500 memset(attribute_map, 0, sizeof(attribute_map)); 501 502 int reg = 0; 503 504 /* The payload always contains important data in r0. */ 505 reg++; 506 507 /* r1 is always part of the payload and it holds information relevant 508 * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in 509 * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID 510 * information (and move the original value to a virtual register if 511 * necessary). 512 */ 513 if (gs_prog_data->include_primitive_id) 514 attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg; 515 reg++; 516 517 reg = setup_uniforms(reg); 518 519 reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg); 520 521 lower_attributes_to_hw_regs(attribute_map, true); 522 523 this->first_non_payload_grf = reg; 524 } 525 526 void 527 gen6_gs_visitor::xfb_setup() 528 { 529 static const unsigned swizzle_for_offset[4] = { 530 BRW_SWIZZLE4(0, 1, 2, 3), 531 BRW_SWIZZLE4(1, 2, 3, 3), 532 BRW_SWIZZLE4(2, 3, 3, 3), 533 BRW_SWIZZLE4(3, 3, 3, 3) 534 }; 535 536 const struct gl_transform_feedback_info *linked_xfb_info = 537 this->prog->sh.LinkedTransformFeedback; 538 int i; 539 540 /* Make sure that the VUE slots won't overflow the unsigned chars in 541 * prog_data->transform_feedback_bindings[]. 542 */ 543 STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256); 544 545 /* Make sure that we don't need more binding table entries than we've 546 * set aside for use in transform feedback. (We shouldn't, since we 547 * set aside enough binding table entries to have one per component). 548 */ 549 assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS); 550 551 gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs; 552 for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) { 553 gs_prog_data->transform_feedback_bindings[i] = 554 linked_xfb_info->Outputs[i].OutputRegister; 555 gs_prog_data->transform_feedback_swizzles[i] = 556 swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset]; 557 } 558 } 559 560 void 561 gen6_gs_visitor::xfb_write() 562 { 563 unsigned num_verts; 564 565 if (!gs_prog_data->num_transform_feedback_bindings) 566 return; 567 568 switch (gs_prog_data->output_topology) { 569 case _3DPRIM_POINTLIST: 570 num_verts = 1; 571 break; 572 case _3DPRIM_LINELIST: 573 case _3DPRIM_LINESTRIP: 574 case _3DPRIM_LINELOOP: 575 num_verts = 2; 576 break; 577 case _3DPRIM_TRILIST: 578 case _3DPRIM_TRIFAN: 579 case _3DPRIM_TRISTRIP: 580 case _3DPRIM_RECTLIST: 581 num_verts = 3; 582 break; 583 case _3DPRIM_QUADLIST: 584 case _3DPRIM_QUADSTRIP: 585 case _3DPRIM_POLYGON: 586 num_verts = 3; 587 break; 588 default: 589 unreachable("Unexpected primitive type in Gen6 SOL program."); 590 } 591 592 this->current_annotation = "gen6 thread end: svb writes init"; 593 594 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); 595 emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u))); 596 597 /* Check that at least one primitive can be written 598 * 599 * Note: since we use the binding table to keep track of buffer offsets 600 * and stride, the GS doesn't need to keep track of a separate pointer 601 * into each buffer; it uses a single pointer which increments by 1 for 602 * each vertex. So we use SVBI0 for this pointer, regardless of whether 603 * transform feedback is in interleaved or separate attribs mode. 604 */ 605 src_reg sol_temp(this, glsl_type::uvec4_type); 606 emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts))); 607 608 /* Compare SVBI calculated number with the maximum value, which is 609 * in R1.4 (previously saved in this->max_svbi) for gen6. 610 */ 611 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); 612 emit(IF(BRW_PREDICATE_NORMAL)); 613 { 614 vec4_instruction *inst = emit(MOV(dst_reg(destination_indices), 615 brw_imm_vf4(brw_float_to_vf(0.0), 616 brw_float_to_vf(1.0), 617 brw_float_to_vf(2.0), 618 brw_float_to_vf(0.0)))); 619 inst->force_writemask_all = true; 620 621 emit(ADD(dst_reg(this->destination_indices), 622 this->destination_indices, 623 this->svbi)); 624 } 625 emit(BRW_OPCODE_ENDIF); 626 627 /* Write transform feedback data for all processed vertices. */ 628 for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) { 629 emit(MOV(dst_reg(sol_temp), brw_imm_d(i))); 630 emit(CMP(dst_null_d(), sol_temp, this->vertex_count, 631 BRW_CONDITIONAL_L)); 632 emit(IF(BRW_PREDICATE_NORMAL)); 633 { 634 xfb_program(i, num_verts); 635 } 636 emit(BRW_OPCODE_ENDIF); 637 } 638 } 639 640 void 641 gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts) 642 { 643 unsigned binding; 644 unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings; 645 src_reg sol_temp(this, glsl_type::uvec4_type); 646 647 /* Check for buffer overflow: we need room to write the complete primitive 648 * (all vertices). Otherwise, avoid writing any vertices for it 649 */ 650 emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u))); 651 emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts))); 652 emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi)); 653 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); 654 emit(IF(BRW_PREDICATE_NORMAL)); 655 { 656 /* Avoid overwriting MRF 1 as it is used as URB write message header */ 657 dst_reg mrf_reg(MRF, 2); 658 659 this->current_annotation = "gen6: emit SOL vertex data"; 660 /* For each vertex, generate code to output each varying using the 661 * appropriate binding table entry. 662 */ 663 for (binding = 0; binding < num_bindings; ++binding) { 664 unsigned char varying = 665 gs_prog_data->transform_feedback_bindings[binding]; 666 667 /* Set up the correct destination index for this vertex */ 668 vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX, 669 mrf_reg, 670 this->destination_indices); 671 inst->sol_vertex = vertex % num_verts; 672 673 /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1: 674 * 675 * "Prior to End of Thread with a URB_WRITE, the kernel must 676 * ensure that all writes are complete by sending the final 677 * write as a committed write." 678 */ 679 bool final_write = binding == (unsigned) num_bindings - 1 && 680 inst->sol_vertex == num_verts - 1; 681 682 /* Compute offset of this varying for the current vertex 683 * in vertex_output 684 */ 685 this->current_annotation = output_reg_annotation[varying]; 686 src_reg data(this->vertex_output); 687 data.reladdr = ralloc(mem_ctx, src_reg); 688 int offset = get_vertex_output_offset_for_varying(vertex, varying); 689 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset))); 690 memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 691 data.type = output_reg[varying][0].type; 692 693 /* PSIZ, LAYER and VIEWPORT are packed in different channels of the 694 * same slot, so make sure we write the appropriate channel 695 */ 696 if (varying == VARYING_SLOT_PSIZ) 697 data.swizzle = BRW_SWIZZLE_WWWW; 698 else if (varying == VARYING_SLOT_LAYER) 699 data.swizzle = BRW_SWIZZLE_YYYY; 700 else if (varying == VARYING_SLOT_VIEWPORT) 701 data.swizzle = BRW_SWIZZLE_ZZZZ; 702 else 703 data.swizzle = gs_prog_data->transform_feedback_swizzles[binding]; 704 705 /* Write data */ 706 inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp); 707 inst->sol_binding = binding; 708 inst->sol_final_write = final_write; 709 710 if (final_write) { 711 /* This is the last vertex of the primitive, then increment 712 * SO num primitive counter and destination indices. 713 */ 714 emit(ADD(dst_reg(this->destination_indices), 715 this->destination_indices, 716 brw_imm_ud(num_verts))); 717 emit(ADD(dst_reg(this->sol_prim_written), 718 this->sol_prim_written, brw_imm_ud(1u))); 719 } 720 721 } 722 this->current_annotation = NULL; 723 } 724 emit(BRW_OPCODE_ENDIF); 725 } 726 727 int 728 gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying) 729 { 730 /* Find the output slot assigned to this varying. 731 * 732 * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot 733 * as VARYING_SLOT_PSIZ. 734 */ 735 if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) 736 varying = VARYING_SLOT_PSIZ; 737 int slot = prog_data->vue_map.varying_to_slot[varying]; 738 739 if (slot < 0) { 740 /* This varying does not exist in the VUE so we are not writing to it 741 * and its value is undefined. We still want to return a valid offset 742 * into vertex_output though, to prevent any out-of-bound accesses into 743 * the vertex_output array. Since the value for this varying is undefined 744 * we don't really care for the value we assign to it, so any offset 745 * within the limits of vertex_output will do. 746 */ 747 slot = 0; 748 } 749 750 return vertex * (prog_data->vue_map.num_slots + 1) + slot; 751 } 752 753 } /* namespace brw */ 754