1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file pa.h 24 * 25 * @brief Definitions for primitive assembly. 26 * N primitives are assembled at a time, where N is the SIMD width. 27 * A state machine, that is specific for a given topology, drives the 28 * assembly of vertices into triangles. 29 * 30 ******************************************************************************/ 31 #pragma once 32 33 #include "frontend.h" 34 35 struct PA_STATE 36 { 37 DRAW_CONTEXT *pDC{ nullptr }; // draw context 38 uint8_t* pStreamBase{ nullptr }; // vertex stream 39 uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts 40 41 // The topology the binner will use. In some cases the FE changes the topology from the api state. 42 PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; 43 44 PA_STATE() {} 45 PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : 46 pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {} 47 48 virtual bool HasWork() = 0; 49 virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; 50 virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; 51 virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0; 52 virtual bool NextPrim() = 0; 53 virtual simdvertex& GetNextVsOutput() = 0; 54 virtual bool GetNextStreamOutput() = 0; 55 virtual simdmask& GetNextVsIndices() = 0; 56 virtual uint32_t NumPrims() = 0; 57 virtual void Reset() = 0; 58 virtual simdscalari GetPrimID(uint32_t startID) = 0; 59 }; 60 61 // The Optimized PA is a state machine that assembles triangles from vertex shader simd 62 // output. Here is the sequence 63 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd). 64 // 2. Execute PA function to assemble and bin triangles. 65 // a. The PA function is a set of functions that collectively make up the 66 // state machine for a given topology. 67 // 1. We use a state index to track which PA function to call. 68 // b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle. 69 // 1. We call this the current and previous simd vertex. 70 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In 71 // order to assemble the second triangle, for a triangle list, we'll need the 72 // last vertex from the previous simd and the first 2 vertices from the current simd. 73 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices. 74 // 75 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without 76 // cuts 77 struct PA_STATE_OPT : public PA_STATE 78 { 79 simdvertex leadingVertex; // For tri-fan 80 uint32_t numPrims{ 0 }; // Total number of primitives for draw. 81 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives. 82 83 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd. 84 85 uint32_t cur{ 0 }; // index to current VS output. 86 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state. 87 uint32_t first{ 0 }; // index to first VS output. Used for trifan. 88 89 uint32_t counter{ 0 }; // state counter 90 bool reset{ false }; // reset state 91 92 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2}) 93 simdscalari primID; 94 95 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); 96 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); 97 98 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles. 99 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. 100 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset 101 102 // state used to advance the PA when Next is called 103 PFN_PA_FUNC pfnPaNextFunc{ nullptr }; 104 uint32_t nextNumSimdPrims{ 0 }; 105 uint32_t nextNumPrimsIncrement{ 0 }; 106 bool nextReset{ false }; 107 bool isStreaming{ false }; 108 109 simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function 110 111 PA_STATE_OPT() {} 112 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, 113 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); 114 115 bool HasWork() 116 { 117 return (this->numPrimsComplete < this->numPrims) ? true : false; 118 } 119 120 simdvector& GetSimdVector(uint32_t index, uint32_t slot) 121 { 122 simdvertex* pVertex = (simdvertex*)pStreamBase; 123 return pVertex[index].attrib[slot]; 124 } 125 126 // Assembles 4 triangles. Each simdvector is a single vertex from 4 127 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. 128 bool Assemble(uint32_t slot, simdvector verts[]) 129 { 130 return this->pfnPaFunc(*this, slot, verts); 131 } 132 133 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). 134 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) 135 { 136 return this->pfnPaSingleFunc(*this, slot, primIndex, verts); 137 } 138 139 bool NextPrim() 140 { 141 this->pfnPaFunc = this->pfnPaNextFunc; 142 this->numSimdPrims = this->nextNumSimdPrims; 143 this->numPrimsComplete += this->nextNumPrimsIncrement; 144 this->reset = this->nextReset; 145 146 if (this->isStreaming) 147 { 148 this->reset = false; 149 } 150 151 bool morePrims = false; 152 153 if (this->numSimdPrims > 0) 154 { 155 morePrims = true; 156 this->numSimdPrims--; 157 } 158 else 159 { 160 this->counter = (this->reset) ? 0 : (this->counter + 1); 161 this->reset = false; 162 } 163 164 this->pfnPaFunc = this->pfnPaNextFunc; 165 166 if (!HasWork()) 167 { 168 morePrims = false; // no more to do 169 } 170 171 return morePrims; 172 } 173 174 simdvertex& GetNextVsOutput() 175 { 176 // increment cur and prev indices 177 const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH; 178 this->prev = this->cur; // prev is undefined for first state. 179 this->cur = this->counter % numSimdVerts; 180 181 simdvertex* pVertex = (simdvertex*)pStreamBase; 182 return pVertex[this->cur]; 183 } 184 185 simdmask& GetNextVsIndices() 186 { 187 // unused in optimized PA, pass tmp buffer back 188 return tmpIndices; 189 } 190 191 bool GetNextStreamOutput() 192 { 193 this->prev = this->cur; 194 this->cur = this->counter; 195 196 return HasWork(); 197 } 198 199 uint32_t NumPrims() 200 { 201 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? 202 (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; 203 } 204 205 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, 206 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, 207 uint32_t numSimdPrims = 0, 208 uint32_t numPrimsIncrement = 0, 209 bool reset = false) 210 { 211 this->pfnPaNextFunc = pfnPaNextFunc; 212 this->nextNumSimdPrims = numSimdPrims; 213 this->nextNumPrimsIncrement = numPrimsIncrement; 214 this->nextReset = reset; 215 216 this->pfnPaSingleFunc = pfnPaNextSingleFunc; 217 } 218 219 void Reset() 220 { 221 this->pfnPaFunc = this->pfnPaFuncReset; 222 this->numPrimsComplete = 0; 223 this->numSimdPrims = 0; 224 this->cur = 0; 225 this->prev = 0; 226 this->first = 0; 227 this->counter = 0; 228 this->reset = false; 229 } 230 231 simdscalari GetPrimID(uint32_t startID) 232 { 233 return _simd_add_epi32(this->primID, 234 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); 235 } 236 }; 237 238 // helper C wrappers to avoid having to rewrite all the PA topology state functions 239 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, 240 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, 241 uint32_t numSimdPrims = 0, 242 uint32_t numPrimsIncrement = 0, 243 bool reset = false) 244 { 245 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); 246 } 247 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) 248 { 249 return pa.GetSimdVector(index, slot); 250 } 251 252 INLINE __m128 swizzleLane0(const simdvector &a) 253 { 254 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); 255 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); 256 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); 257 } 258 259 INLINE __m128 swizzleLane1(const simdvector &a) 260 { 261 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); 262 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); 263 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); 264 } 265 266 INLINE __m128 swizzleLane2(const simdvector &a) 267 { 268 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); 269 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); 270 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); 271 } 272 273 INLINE __m128 swizzleLane3(const simdvector &a) 274 { 275 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); 276 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); 277 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); 278 } 279 280 INLINE __m128 swizzleLane4(const simdvector &a) 281 { 282 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); 283 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); 284 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); 285 286 } 287 288 INLINE __m128 swizzleLane5(const simdvector &a) 289 { 290 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); 291 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); 292 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); 293 } 294 295 INLINE __m128 swizzleLane6(const simdvector &a) 296 { 297 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); 298 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); 299 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); 300 } 301 302 INLINE __m128 swizzleLane7(const simdvector &a) 303 { 304 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); 305 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); 306 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); 307 } 308 309 INLINE __m128 swizzleLaneN(const simdvector &a, int lane) 310 { 311 switch (lane) { 312 case 0: 313 return swizzleLane0(a); 314 case 1: 315 return swizzleLane1(a); 316 case 2: 317 return swizzleLane2(a); 318 case 3: 319 return swizzleLane3(a); 320 case 4: 321 return swizzleLane4(a); 322 case 5: 323 return swizzleLane5(a); 324 case 6: 325 return swizzleLane6(a); 326 case 7: 327 return swizzleLane7(a); 328 default: 329 return _mm_setzero_ps(); 330 } 331 } 332 333 // Cut-aware primitive assembler. 334 struct PA_STATE_CUT : public PA_STATE 335 { 336 simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex 337 uint32_t numVerts{ 0 }; // number of vertices available in buffer store 338 uint32_t numAttribs{ 0 }; // number of attributes 339 int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled 340 uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw 341 OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather 342 simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd 343 uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled 344 uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store 345 uint32_t tailVertex{ 0 }; // beginning vertex currently assembling 346 uint32_t curVertex{ 0 }; // current unprocessed vertex 347 uint32_t startPrimId{ 0 }; // starting prim id 348 simdscalari vPrimId; // vector of prim ID 349 bool needOffsets{ false }; // need to compute gather offsets for current SIMD 350 uint32_t vertsPerPrim{ 0 }; 351 simdvertex tmpVertex; // temporary simdvertex for unimplemented API 352 bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they 353 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored 354 // while the GS sends valid verts for every index 355 // Topology state tracking 356 uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; 357 uint32_t curIndex{ 0 }; 358 bool reverseWinding{ false }; // indicates reverse winding for strips 359 int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj 360 361 typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); 362 PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert 363 364 PA_STATE_CUT() {} 365 PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, 366 uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts) 367 : PA_STATE(pDC, in_pStream, in_streamSizeInVerts) 368 { 369 numVerts = in_streamSizeInVerts; 370 numAttribs = in_numAttribs; 371 binTopology = topo; 372 needOffsets = false; 373 processCutVerts = in_processCutVerts; 374 375 numVertsToAssemble = numRemainingVerts = in_numVerts; 376 numPrimsAssembled = 0; 377 headVertex = tailVertex = curVertex = 0; 378 379 curIndex = 0; 380 pCutIndices = in_pIndices; 381 memset(indices, 0, sizeof(indices)); 382 vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); 383 reverseWinding = false; 384 adjExtraVert = -1; 385 386 bool gsEnabled = pDC->pState->state.gsState.gsEnable; 387 vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); 388 389 switch (topo) 390 { 391 case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break; 392 case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break; 393 case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break; 394 case TOP_TRI_STRIP_ADJ: if (gsEnabled) 395 { 396 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ; 397 } 398 else 399 { 400 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ; 401 } 402 break; 403 404 case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break; 405 case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break; 406 case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break; 407 case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break; 408 case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; 409 default: assert(0 && "Unimplemented topology"); 410 } 411 } 412 413 simdvertex& GetNextVsOutput() 414 { 415 uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; 416 this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts; 417 this->needOffsets = true; 418 return ((simdvertex*)pStreamBase)[vertexIndex]; 419 } 420 421 simdmask& GetNextVsIndices() 422 { 423 uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; 424 simdmask* pCurCutIndex = this->pCutIndices + vertexIndex; 425 return *pCurCutIndex; 426 } 427 428 simdvector& GetSimdVector(uint32_t index, uint32_t slot) 429 { 430 // unused 431 SWR_ASSERT(0 && "Not implemented"); 432 return this->tmpVertex.attrib[0]; 433 } 434 435 bool GetNextStreamOutput() 436 { 437 this->headVertex += KNOB_SIMD_WIDTH; 438 this->needOffsets = true; 439 return HasWork(); 440 } 441 442 simdscalari GetPrimID(uint32_t startID) 443 { 444 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); 445 } 446 447 void Reset() 448 { 449 this->numRemainingVerts = this->numVertsToAssemble; 450 this->numPrimsAssembled = 0; 451 this->curIndex = 0; 452 this->curVertex = 0; 453 this->tailVertex = 0; 454 this->headVertex = 0; 455 this->reverseWinding = false; 456 this->adjExtraVert = -1; 457 this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); 458 } 459 460 bool HasWork() 461 { 462 return this->numRemainingVerts > 0 || this->adjExtraVert != -1; 463 } 464 465 bool IsVertexStoreFull() 466 { 467 return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex; 468 } 469 470 void RestartTopology() 471 { 472 this->curIndex = 0; 473 this->reverseWinding = false; 474 this->adjExtraVert = -1; 475 } 476 477 bool IsCutIndex(uint32_t vertex) 478 { 479 uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH; 480 uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1); 481 return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; 482 } 483 484 // iterates across the unprocessed verts until we hit the end or we 485 // have assembled SIMD prims 486 void ProcessVerts() 487 { 488 while (this->numPrimsAssembled != KNOB_SIMD_WIDTH && 489 this->numRemainingVerts > 0 && 490 this->curVertex != this->headVertex) 491 { 492 // if cut index, restart topology 493 if (IsCutIndex(this->curVertex)) 494 { 495 if (this->processCutVerts) 496 { 497 (this->*pfnPa)(this->curVertex, false); 498 } 499 // finish off tri strip w/ adj before restarting topo 500 if (this->adjExtraVert != -1) 501 { 502 (this->*pfnPa)(this->curVertex, true); 503 } 504 RestartTopology(); 505 } 506 else 507 { 508 (this->*pfnPa)(this->curVertex, false); 509 } 510 511 this->curVertex++; 512 if (this->curVertex >= this->numVerts) { 513 this->curVertex = 0; 514 } 515 this->numRemainingVerts--; 516 } 517 518 // special case last primitive for tri strip w/ adj 519 if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) 520 { 521 (this->*pfnPa)(this->curVertex, true); 522 } 523 } 524 525 void Advance() 526 { 527 // done with current batch 528 // advance tail to the current unsubmitted vertex 529 this->tailVertex = this->curVertex; 530 this->numPrimsAssembled = 0; 531 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH)); 532 } 533 534 bool NextPrim() 535 { 536 // if we've assembled enough prims, we can advance to the next set of verts 537 if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0) 538 { 539 Advance(); 540 } 541 return false; 542 } 543 544 void ComputeOffsets() 545 { 546 for (uint32_t v = 0; v < this->vertsPerPrim; ++v) 547 { 548 simdscalari vIndices = *(simdscalari*)&this->indices[v][0]; 549 550 // step to simdvertex batch 551 const uint32_t simdShift = 3; // @todo make knob 552 simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift); 553 this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex))); 554 555 // step to index 556 const uint32_t simdMask = 0x7; // @todo make knob 557 simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); 558 this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); 559 } 560 } 561 562 bool Assemble(uint32_t slot, simdvector result[]) 563 { 564 // process any outstanding verts 565 ProcessVerts(); 566 567 // return false if we don't have enough prims assembled 568 if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0) 569 { 570 return false; 571 } 572 573 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble 574 if (this->needOffsets) 575 { 576 ComputeOffsets(); 577 this->needOffsets = false; 578 } 579 580 for (uint32_t v = 0; v < this->vertsPerPrim; ++v) 581 { 582 simdscalari offsets = this->vOffsets[v]; 583 584 // step to attribute 585 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); 586 587 float* pBase = (float*)this->pStreamBase; 588 for (uint32_t c = 0; c < 4; ++c) 589 { 590 result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); 591 592 // move base to next component 593 pBase += KNOB_SIMD_WIDTH; 594 } 595 } 596 597 return true; 598 } 599 600 void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) 601 { 602 // move to slot 603 for (uint32_t v = 0; v < this->vertsPerPrim; ++v) 604 { 605 uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; 606 uint32_t offset = pOffset[triIndex]; 607 offset += sizeof(simdvector) * slot; 608 float* pVert = (float*)&tri[v]; 609 for (uint32_t c = 0; c < 4; ++c) 610 { 611 float* pComponent = (float*)(this->pStreamBase + offset); 612 pVert[c] = *pComponent; 613 offset += KNOB_SIMD_WIDTH * sizeof(float); 614 } 615 } 616 } 617 618 uint32_t NumPrims() 619 { 620 return this->numPrimsAssembled; 621 } 622 623 // Per-topology functions 624 void ProcessVertTriStrip(uint32_t index, bool finish) 625 { 626 this->vert[this->curIndex] = index; 627 this->curIndex++; 628 if (this->curIndex == 3) 629 { 630 // assembled enough verts for prim, add to gather indices 631 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 632 if (reverseWinding) 633 { 634 this->indices[1][this->numPrimsAssembled] = this->vert[2]; 635 this->indices[2][this->numPrimsAssembled] = this->vert[1]; 636 } 637 else 638 { 639 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 640 this->indices[2][this->numPrimsAssembled] = this->vert[2]; 641 } 642 643 // increment numPrimsAssembled 644 this->numPrimsAssembled++; 645 646 // set up next prim state 647 this->vert[0] = this->vert[1]; 648 this->vert[1] = this->vert[2]; 649 this->curIndex = 2; 650 this->reverseWinding ^= 1; 651 } 652 } 653 654 template<bool gsEnabled> 655 void AssembleTriStripAdj() 656 { 657 if (!gsEnabled) 658 { 659 this->vert[1] = this->vert[2]; 660 this->vert[2] = this->vert[4]; 661 662 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 663 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 664 this->indices[2][this->numPrimsAssembled] = this->vert[2]; 665 666 this->vert[4] = this->vert[2]; 667 this->vert[2] = this->vert[1]; 668 } 669 else 670 { 671 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 672 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 673 this->indices[2][this->numPrimsAssembled] = this->vert[2]; 674 this->indices[3][this->numPrimsAssembled] = this->vert[3]; 675 this->indices[4][this->numPrimsAssembled] = this->vert[4]; 676 this->indices[5][this->numPrimsAssembled] = this->vert[5]; 677 } 678 this->numPrimsAssembled++; 679 } 680 681 682 template<bool gsEnabled> 683 void ProcessVertTriStripAdj(uint32_t index, bool finish) 684 { 685 // handle last primitive of tristrip 686 if (finish && this->adjExtraVert != -1) 687 { 688 this->vert[3] = this->adjExtraVert; 689 AssembleTriStripAdj<gsEnabled>(); 690 this->adjExtraVert = -1; 691 return; 692 } 693 694 switch (this->curIndex) 695 { 696 case 0: 697 case 1: 698 case 2: 699 case 4: 700 this->vert[this->curIndex] = index; 701 this->curIndex++; 702 break; 703 case 3: 704 this->vert[5] = index; 705 this->curIndex++; 706 break; 707 case 5: 708 if (this->adjExtraVert == -1) 709 { 710 this->adjExtraVert = index; 711 } 712 else 713 { 714 this->vert[3] = index; 715 if (!gsEnabled) 716 { 717 AssembleTriStripAdj<gsEnabled>(); 718 719 uint32_t nextTri[6]; 720 if (this->reverseWinding) 721 { 722 nextTri[0] = this->vert[4]; 723 nextTri[1] = this->vert[0]; 724 nextTri[2] = this->vert[2]; 725 nextTri[4] = this->vert[3]; 726 nextTri[5] = this->adjExtraVert; 727 } 728 else 729 { 730 nextTri[0] = this->vert[2]; 731 nextTri[1] = this->adjExtraVert; 732 nextTri[2] = this->vert[3]; 733 nextTri[4] = this->vert[4]; 734 nextTri[5] = this->vert[0]; 735 } 736 for (uint32_t i = 0; i < 6; ++i) 737 { 738 this->vert[i] = nextTri[i]; 739 } 740 741 this->adjExtraVert = -1; 742 this->reverseWinding ^= 1; 743 } 744 else 745 { 746 this->curIndex++; 747 } 748 } 749 break; 750 case 6: 751 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!"); 752 AssembleTriStripAdj<gsEnabled>(); 753 754 uint32_t nextTri[6]; 755 if (this->reverseWinding) 756 { 757 nextTri[0] = this->vert[4]; 758 nextTri[1] = this->vert[0]; 759 nextTri[2] = this->vert[2]; 760 nextTri[4] = this->vert[3]; 761 nextTri[5] = this->adjExtraVert; 762 } 763 else 764 { 765 nextTri[0] = this->vert[2]; 766 nextTri[1] = this->adjExtraVert; 767 nextTri[2] = this->vert[3]; 768 nextTri[4] = this->vert[4]; 769 nextTri[5] = this->vert[0]; 770 } 771 for (uint32_t i = 0; i < 6; ++i) 772 { 773 this->vert[i] = nextTri[i]; 774 } 775 this->reverseWinding ^= 1; 776 this->adjExtraVert = index; 777 this->curIndex--; 778 break; 779 } 780 } 781 782 void ProcessVertTriList(uint32_t index, bool finish) 783 { 784 this->vert[this->curIndex] = index; 785 this->curIndex++; 786 if (this->curIndex == 3) 787 { 788 // assembled enough verts for prim, add to gather indices 789 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 790 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 791 this->indices[2][this->numPrimsAssembled] = this->vert[2]; 792 793 // increment numPrimsAssembled 794 this->numPrimsAssembled++; 795 796 // set up next prim state 797 this->curIndex = 0; 798 } 799 } 800 801 void ProcessVertTriListAdj(uint32_t index, bool finish) 802 { 803 this->vert[this->curIndex] = index; 804 this->curIndex++; 805 if (this->curIndex == 6) 806 { 807 // assembled enough verts for prim, add to gather indices 808 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 809 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 810 this->indices[2][this->numPrimsAssembled] = this->vert[2]; 811 this->indices[3][this->numPrimsAssembled] = this->vert[3]; 812 this->indices[4][this->numPrimsAssembled] = this->vert[4]; 813 this->indices[5][this->numPrimsAssembled] = this->vert[5]; 814 815 // increment numPrimsAssembled 816 this->numPrimsAssembled++; 817 818 // set up next prim state 819 this->curIndex = 0; 820 } 821 } 822 823 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish) 824 { 825 this->vert[this->curIndex] = index; 826 this->curIndex++; 827 if (this->curIndex == 6) 828 { 829 // assembled enough verts for prim, add to gather indices 830 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 831 this->indices[1][this->numPrimsAssembled] = this->vert[2]; 832 this->indices[2][this->numPrimsAssembled] = this->vert[4]; 833 834 // increment numPrimsAssembled 835 this->numPrimsAssembled++; 836 837 // set up next prim state 838 this->curIndex = 0; 839 } 840 } 841 842 843 void ProcessVertLineList(uint32_t index, bool finish) 844 { 845 this->vert[this->curIndex] = index; 846 this->curIndex++; 847 if (this->curIndex == 2) 848 { 849 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 850 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 851 852 this->numPrimsAssembled++; 853 this->curIndex = 0; 854 } 855 } 856 857 void ProcessVertLineStrip(uint32_t index, bool finish) 858 { 859 this->vert[this->curIndex] = index; 860 this->curIndex++; 861 if (this->curIndex == 2) 862 { 863 // assembled enough verts for prim, add to gather indices 864 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 865 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 866 867 // increment numPrimsAssembled 868 this->numPrimsAssembled++; 869 870 // set up next prim state 871 this->vert[0] = this->vert[1]; 872 this->curIndex = 1; 873 } 874 } 875 876 void ProcessVertLineStripAdj(uint32_t index, bool finish) 877 { 878 this->vert[this->curIndex] = index; 879 this->curIndex++; 880 if (this->curIndex == 4) 881 { 882 // assembled enough verts for prim, add to gather indices 883 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 884 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 885 this->indices[2][this->numPrimsAssembled] = this->vert[2]; 886 this->indices[3][this->numPrimsAssembled] = this->vert[3]; 887 888 // increment numPrimsAssembled 889 this->numPrimsAssembled++; 890 891 // set up next prim state 892 this->vert[0] = this->vert[1]; 893 this->vert[1] = this->vert[2]; 894 this->vert[2] = this->vert[3]; 895 this->curIndex = 3; 896 } 897 } 898 899 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) 900 { 901 this->vert[this->curIndex] = index; 902 this->curIndex++; 903 if (this->curIndex == 4) 904 { 905 // assembled enough verts for prim, add to gather indices 906 this->indices[0][this->numPrimsAssembled] = this->vert[1]; 907 this->indices[1][this->numPrimsAssembled] = this->vert[2]; 908 909 // increment numPrimsAssembled 910 this->numPrimsAssembled++; 911 912 // set up next prim state 913 this->vert[0] = this->vert[1]; 914 this->vert[1] = this->vert[2]; 915 this->vert[2] = this->vert[3]; 916 this->curIndex = 3; 917 } 918 } 919 920 void ProcessVertLineListAdj(uint32_t index, bool finish) 921 { 922 this->vert[this->curIndex] = index; 923 this->curIndex++; 924 if (this->curIndex == 4) 925 { 926 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 927 this->indices[1][this->numPrimsAssembled] = this->vert[1]; 928 this->indices[2][this->numPrimsAssembled] = this->vert[2]; 929 this->indices[3][this->numPrimsAssembled] = this->vert[3]; 930 931 this->numPrimsAssembled++; 932 this->curIndex = 0; 933 } 934 } 935 936 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) 937 { 938 this->vert[this->curIndex] = index; 939 this->curIndex++; 940 if (this->curIndex == 4) 941 { 942 this->indices[0][this->numPrimsAssembled] = this->vert[1]; 943 this->indices[1][this->numPrimsAssembled] = this->vert[2]; 944 945 this->numPrimsAssembled++; 946 this->curIndex = 0; 947 } 948 } 949 950 void ProcessVertPointList(uint32_t index, bool finish) 951 { 952 this->vert[this->curIndex] = index; 953 this->curIndex++; 954 if (this->curIndex == 1) 955 { 956 this->indices[0][this->numPrimsAssembled] = this->vert[0]; 957 this->numPrimsAssembled++; 958 this->curIndex = 0; 959 } 960 } 961 }; 962 963 // Primitive Assembly for data output from the DomainShader. 964 struct PA_TESS : PA_STATE 965 { 966 PA_TESS( 967 DRAW_CONTEXT *in_pDC, 968 const simdscalar* in_pVertData, 969 uint32_t in_attributeStrideInVectors, 970 uint32_t in_numAttributes, 971 uint32_t* (&in_ppIndices)[3], 972 uint32_t in_numPrims, 973 PRIMITIVE_TOPOLOGY in_binTopology) : 974 975 PA_STATE(in_pDC, nullptr, 0), 976 m_pVertexData(in_pVertData), 977 m_attributeStrideInVectors(in_attributeStrideInVectors), 978 m_numAttributes(in_numAttributes), 979 m_numPrims(in_numPrims) 980 { 981 m_vPrimId = _simd_setzero_si(); 982 binTopology = in_binTopology; 983 m_ppIndices[0] = in_ppIndices[0]; 984 m_ppIndices[1] = in_ppIndices[1]; 985 m_ppIndices[2] = in_ppIndices[2]; 986 987 switch (binTopology) 988 { 989 case TOP_POINT_LIST: 990 m_numVertsPerPrim = 1; 991 break; 992 993 case TOP_LINE_LIST: 994 m_numVertsPerPrim = 2; 995 break; 996 997 case TOP_TRIANGLE_LIST: 998 m_numVertsPerPrim = 3; 999 break; 1000 1001 default: 1002 SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); 1003 break; 1004 } 1005 } 1006 1007 bool HasWork() 1008 { 1009 return m_numPrims != 0; 1010 } 1011 1012 simdvector& GetSimdVector(uint32_t index, uint32_t slot) 1013 { 1014 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); 1015 static simdvector junk; 1016 return junk; 1017 } 1018 1019 static simdscalari GenPrimMask(uint32_t numPrims) 1020 { 1021 SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); 1022 #if KNOB_SIMD_WIDTH == 8 1023 static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = 1024 { 1025 -1, -1, -1, -1, -1, -1, -1, -1, 1026 0, 0, 0, 0, 0, 0, 0, 0 1027 }; 1028 #elif KNOB_SIMD_WIDTH == 16 1029 static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = 1030 { 1031 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1032 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 1033 }; 1034 #else 1035 #error "Help, help, I can't get up!" 1036 #endif 1037 1038 return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); 1039 } 1040 1041 bool Assemble(uint32_t slot, simdvector verts[]) 1042 { 1043 static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); 1044 SWR_ASSERT(slot < m_numAttributes); 1045 1046 uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); 1047 if (0 == numPrimsToAssemble) 1048 { 1049 return false; 1050 } 1051 1052 simdscalari mask = GenPrimMask(numPrimsToAssemble); 1053 1054 const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; 1055 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) 1056 { 1057 simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]); 1058 1059 const float* pBase = pBaseAttrib; 1060 for (uint32_t c = 0; c < 4; ++c) 1061 { 1062 verts[i].v[c] = _simd_mask_i32gather_ps( 1063 _simd_setzero_ps(), 1064 pBase, 1065 indices, 1066 _simd_castsi_ps(mask), 1067 4 /* gcc doesn't like sizeof(float) */); 1068 pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; 1069 } 1070 } 1071 1072 return true; 1073 } 1074 1075 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) 1076 { 1077 SWR_ASSERT(slot < m_numAttributes); 1078 SWR_ASSERT(primIndex < PA_TESS::NumPrims()); 1079 1080 const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; 1081 for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) 1082 { 1083 uint32_t index = m_ppIndices[i][primIndex]; 1084 const float* pVertData = pVertDataBase; 1085 float* pVert = (float*)&verts[i]; 1086 1087 for (uint32_t c = 0; c < 4; ++c) 1088 { 1089 pVert[c] = pVertData[index]; 1090 pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; 1091 } 1092 } 1093 } 1094 1095 bool NextPrim() 1096 { 1097 uint32_t numPrims = PA_TESS::NumPrims(); 1098 m_numPrims -= numPrims; 1099 m_ppIndices[0] += numPrims; 1100 m_ppIndices[1] += numPrims; 1101 m_ppIndices[2] += numPrims; 1102 1103 return HasWork(); 1104 } 1105 1106 simdvertex& GetNextVsOutput() 1107 { 1108 SWR_ASSERT(0, "%s", __FUNCTION__); 1109 static simdvertex junk; 1110 return junk; 1111 } 1112 1113 bool GetNextStreamOutput() 1114 { 1115 SWR_ASSERT(0, "%s", __FUNCTION__); 1116 return false; 1117 } 1118 1119 simdmask& GetNextVsIndices() 1120 { 1121 SWR_ASSERT(0, "%s", __FUNCTION__); 1122 static simdmask junk; 1123 return junk; 1124 } 1125 1126 uint32_t NumPrims() 1127 { 1128 return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH); 1129 } 1130 1131 void Reset() { SWR_ASSERT(0); }; 1132 1133 simdscalari GetPrimID(uint32_t startID) 1134 { 1135 return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); 1136 } 1137 1138 private: 1139 const simdscalar* m_pVertexData = nullptr; 1140 uint32_t m_attributeStrideInVectors = 0; 1141 uint32_t m_numAttributes = 0; 1142 uint32_t m_numPrims = 0; 1143 uint32_t* m_ppIndices[3]; 1144 1145 uint32_t m_numVertsPerPrim = 0; 1146 1147 simdscalari m_vPrimId; 1148 }; 1149 1150 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler 1151 // based on state. 1152 template <typename IsIndexedT, typename IsCutIndexEnabledT> 1153 struct PA_FACTORY 1154 { 1155 PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) 1156 { 1157 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE 1158 const API_STATE& state = GetApiState(pDC); 1159 if ((IsIndexedT::value && IsCutIndexEnabledT::value && ( 1160 topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || 1161 topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || 1162 topo == TOP_TRIANGLE_LIST)) || 1163 1164 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support 1165 // for them in the optimized PA 1166 (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)) 1167 { 1168 memset(&indexStore, 0, sizeof(indexStore)); 1169 uint32_t numAttribs = state.feNumAttributes; 1170 1171 new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, 1172 &this->indexStore[0], numVerts, numAttribs, state.topology, false); 1173 cutPA = true; 1174 } 1175 else 1176 #endif 1177 { 1178 uint32_t numPrims = GetNumPrims(in_topo, numVerts); 1179 new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false); 1180 cutPA = false; 1181 } 1182 1183 } 1184 1185 PA_STATE& GetPA() 1186 { 1187 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE 1188 if (cutPA) 1189 { 1190 return this->paCut; 1191 } 1192 else 1193 #endif 1194 { 1195 return this->paOpt; 1196 } 1197 } 1198 1199 PA_STATE_OPT paOpt; 1200 PA_STATE_CUT paCut; 1201 bool cutPA{ false }; 1202 1203 PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; 1204 1205 simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; 1206 simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; 1207 }; 1208