1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file frontend.cpp 24 * 25 * @brief Implementation for Frontend which handles vertex processing, 26 * primitive assembly, clipping, binning, etc. 
27 * 28 ******************************************************************************/ 29 30 #include "api.h" 31 #include "frontend.h" 32 #include "backend.h" 33 #include "context.h" 34 #include "rdtsc_core.h" 35 #include "utils.h" 36 #include "threads.h" 37 #include "pa.h" 38 #include "clip.h" 39 #include "tilemgr.h" 40 #include "tessellator.h" 41 #include <limits> 42 #include <iostream> 43 44 ////////////////////////////////////////////////////////////////////////// 45 /// @brief Helper macro to generate a bitmask 46 static INLINE uint32_t GenMask(uint32_t numBits) 47 { 48 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); 49 return ((1U << numBits) - 1); 50 } 51 52 ////////////////////////////////////////////////////////////////////////// 53 /// @brief FE handler for SwrSync. 54 /// @param pContext - pointer to SWR context. 55 /// @param pDC - pointer to draw context. 56 /// @param workerId - thread's worker id. Even thread has a unique id. 57 /// @param pUserData - Pointer to user data passed back to sync callback. 58 /// @todo This should go away when we switch this to use compute threading. 59 void ProcessSync( 60 SWR_CONTEXT *pContext, 61 DRAW_CONTEXT *pDC, 62 uint32_t workerId, 63 void *pUserData) 64 { 65 BE_WORK work; 66 work.type = SYNC; 67 work.pfnWork = ProcessSyncBE; 68 69 MacroTileMgr *pTileMgr = pDC->pTileMgr; 70 pTileMgr->enqueue(0, 0, &work); 71 } 72 73 ////////////////////////////////////////////////////////////////////////// 74 /// @brief FE handler for SwrDestroyContext. 75 /// @param pContext - pointer to SWR context. 76 /// @param pDC - pointer to draw context. 77 /// @param workerId - thread's worker id. Even thread has a unique id. 78 /// @param pUserData - Pointer to user data passed back to sync callback. 
79 void ProcessShutdown( 80 SWR_CONTEXT *pContext, 81 DRAW_CONTEXT *pDC, 82 uint32_t workerId, 83 void *pUserData) 84 { 85 BE_WORK work; 86 work.type = SHUTDOWN; 87 work.pfnWork = ProcessShutdownBE; 88 89 MacroTileMgr *pTileMgr = pDC->pTileMgr; 90 // Enqueue at least 1 work item for each worker thread 91 // account for number of numa nodes 92 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1; 93 94 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i) 95 { 96 for (uint32_t n = 0; n < numNumaNodes; ++n) 97 { 98 pTileMgr->enqueue(i, n, &work); 99 } 100 } 101 } 102 103 ////////////////////////////////////////////////////////////////////////// 104 /// @brief FE handler for SwrClearRenderTarget. 105 /// @param pContext - pointer to SWR context. 106 /// @param pDC - pointer to draw context. 107 /// @param workerId - thread's worker id. Even thread has a unique id. 108 /// @param pUserData - Pointer to user data passed back to clear callback. 109 /// @todo This should go away when we switch this to use compute threading. 
110 void ProcessClear( 111 SWR_CONTEXT *pContext, 112 DRAW_CONTEXT *pDC, 113 uint32_t workerId, 114 void *pUserData) 115 { 116 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData; 117 MacroTileMgr *pTileMgr = pDC->pTileMgr; 118 119 // queue a clear to each macro tile 120 // compute macro tile bounds for the specified rect 121 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; 122 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; 123 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; 124 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; 125 126 BE_WORK work; 127 work.type = CLEAR; 128 work.pfnWork = ProcessClearBE; 129 work.desc.clear = *pDesc; 130 131 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) 132 { 133 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) 134 { 135 pTileMgr->enqueue(x, y, &work); 136 } 137 } 138 } 139 140 ////////////////////////////////////////////////////////////////////////// 141 /// @brief FE handler for SwrStoreTiles. 142 /// @param pContext - pointer to SWR context. 143 /// @param pDC - pointer to draw context. 144 /// @param workerId - thread's worker id. Even thread has a unique id. 145 /// @param pUserData - Pointer to user data passed back to callback. 146 /// @todo This should go away when we switch this to use compute threading. 
147 void ProcessStoreTiles( 148 SWR_CONTEXT *pContext, 149 DRAW_CONTEXT *pDC, 150 uint32_t workerId, 151 void *pUserData) 152 { 153 AR_BEGIN(FEProcessStoreTiles, pDC->drawId); 154 MacroTileMgr *pTileMgr = pDC->pTileMgr; 155 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData; 156 157 // queue a store to each macro tile 158 // compute macro tile bounds for the specified rect 159 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; 160 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; 161 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; 162 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; 163 164 // store tiles 165 BE_WORK work; 166 work.type = STORETILES; 167 work.pfnWork = ProcessStoreTilesBE; 168 work.desc.storeTiles = *pDesc; 169 170 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) 171 { 172 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) 173 { 174 pTileMgr->enqueue(x, y, &work); 175 } 176 } 177 178 AR_END(FEProcessStoreTiles, 0); 179 } 180 181 ////////////////////////////////////////////////////////////////////////// 182 /// @brief FE handler for SwrInvalidateTiles. 183 /// @param pContext - pointer to SWR context. 184 /// @param pDC - pointer to draw context. 185 /// @param workerId - thread's worker id. Even thread has a unique id. 186 /// @param pUserData - Pointer to user data passed back to callback. 187 /// @todo This should go away when we switch this to use compute threading. 
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // Default path: only macrotiles fully covered by the rect. Min edges
    // round up; max edges round down, then back off by one to make an
    // inclusive tile index.
    // NOTE(review): if the rect covers less than one full tile, the unsigned
    // "- 1" wraps to a huge value; the std::min<int32_t> clamp below
    // reinterprets that as a negative number so the loops simply don't
    // execute -- presumably intentional, but worth confirming.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles: plain floor of both edges (xmax/ymax are
        // exclusive, hence the -1 before dividing).
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // Clamp with a signed compare on purpose -- see the wrap note above.
    // NOTE(review): clamping an inclusive max index to KNOB_NUM_HOT_TILES_X
    // (rather than KNOB_NUM_HOT_TILES_X - 1) looks like a possible
    // off-by-one; confirm against the hot-tile array dimensions.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles: one work item fanned out to each covered macrotile.
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
239 /// @param numPrims - number of vertices or indices for draw. 240 /// @todo Frontend needs to be refactored. This will go in appropriate place then. 241 uint32_t GetNumPrims( 242 PRIMITIVE_TOPOLOGY mode, 243 uint32_t numPrims) 244 { 245 switch (mode) 246 { 247 case TOP_POINT_LIST: return numPrims; 248 case TOP_TRIANGLE_LIST: return numPrims / 3; 249 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2; 250 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2; 251 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1; 252 case TOP_QUAD_LIST: return numPrims / 4; 253 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2; 254 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1; 255 case TOP_LINE_LIST: return numPrims / 2; 256 case TOP_LINE_LOOP: return numPrims; 257 case TOP_RECT_LIST: return numPrims / 3; 258 case TOP_LINE_LIST_ADJ: return numPrims / 4; 259 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3; 260 case TOP_TRI_LIST_ADJ: return numPrims / 6; 261 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 
0 : (numPrims / 2) - 2; 262 263 case TOP_PATCHLIST_1: 264 case TOP_PATCHLIST_2: 265 case TOP_PATCHLIST_3: 266 case TOP_PATCHLIST_4: 267 case TOP_PATCHLIST_5: 268 case TOP_PATCHLIST_6: 269 case TOP_PATCHLIST_7: 270 case TOP_PATCHLIST_8: 271 case TOP_PATCHLIST_9: 272 case TOP_PATCHLIST_10: 273 case TOP_PATCHLIST_11: 274 case TOP_PATCHLIST_12: 275 case TOP_PATCHLIST_13: 276 case TOP_PATCHLIST_14: 277 case TOP_PATCHLIST_15: 278 case TOP_PATCHLIST_16: 279 case TOP_PATCHLIST_17: 280 case TOP_PATCHLIST_18: 281 case TOP_PATCHLIST_19: 282 case TOP_PATCHLIST_20: 283 case TOP_PATCHLIST_21: 284 case TOP_PATCHLIST_22: 285 case TOP_PATCHLIST_23: 286 case TOP_PATCHLIST_24: 287 case TOP_PATCHLIST_25: 288 case TOP_PATCHLIST_26: 289 case TOP_PATCHLIST_27: 290 case TOP_PATCHLIST_28: 291 case TOP_PATCHLIST_29: 292 case TOP_PATCHLIST_30: 293 case TOP_PATCHLIST_31: 294 case TOP_PATCHLIST_32: 295 return numPrims / (mode - TOP_PATCHLIST_BASE); 296 297 case TOP_POLYGON: 298 case TOP_POINT_LIST_BF: 299 case TOP_LINE_STRIP_CONT: 300 case TOP_LINE_STRIP_BF: 301 case TOP_LINE_STRIP_CONT_BF: 302 case TOP_TRIANGLE_FAN_NOSTIPPLE: 303 case TOP_TRI_STRIP_REVERSE: 304 case TOP_PATCHLIST_BASE: 305 case TOP_UNKNOWN: 306 SWR_INVALID("Unsupported topology: %d", mode); 307 return 0; 308 } 309 310 return 0; 311 } 312 313 ////////////////////////////////////////////////////////////////////////// 314 /// @brief Computes the number of verts given the number of primitives. 315 /// @param mode - primitive topology for draw operation. 316 /// @param numPrims - number of primitives for draw. 317 uint32_t GetNumVerts( 318 PRIMITIVE_TOPOLOGY mode, 319 uint32_t numPrims) 320 { 321 switch (mode) 322 { 323 case TOP_POINT_LIST: return numPrims; 324 case TOP_TRIANGLE_LIST: return numPrims * 3; 325 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0; 326 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0; 327 case TOP_TRIANGLE_DISC: return numPrims ? 
numPrims + 1 : 0; 328 case TOP_QUAD_LIST: return numPrims * 4; 329 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0; 330 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0; 331 case TOP_LINE_LIST: return numPrims * 2; 332 case TOP_LINE_LOOP: return numPrims; 333 case TOP_RECT_LIST: return numPrims * 3; 334 case TOP_LINE_LIST_ADJ: return numPrims * 4; 335 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0; 336 case TOP_TRI_LIST_ADJ: return numPrims * 6; 337 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0; 338 339 case TOP_PATCHLIST_1: 340 case TOP_PATCHLIST_2: 341 case TOP_PATCHLIST_3: 342 case TOP_PATCHLIST_4: 343 case TOP_PATCHLIST_5: 344 case TOP_PATCHLIST_6: 345 case TOP_PATCHLIST_7: 346 case TOP_PATCHLIST_8: 347 case TOP_PATCHLIST_9: 348 case TOP_PATCHLIST_10: 349 case TOP_PATCHLIST_11: 350 case TOP_PATCHLIST_12: 351 case TOP_PATCHLIST_13: 352 case TOP_PATCHLIST_14: 353 case TOP_PATCHLIST_15: 354 case TOP_PATCHLIST_16: 355 case TOP_PATCHLIST_17: 356 case TOP_PATCHLIST_18: 357 case TOP_PATCHLIST_19: 358 case TOP_PATCHLIST_20: 359 case TOP_PATCHLIST_21: 360 case TOP_PATCHLIST_22: 361 case TOP_PATCHLIST_23: 362 case TOP_PATCHLIST_24: 363 case TOP_PATCHLIST_25: 364 case TOP_PATCHLIST_26: 365 case TOP_PATCHLIST_27: 366 case TOP_PATCHLIST_28: 367 case TOP_PATCHLIST_29: 368 case TOP_PATCHLIST_30: 369 case TOP_PATCHLIST_31: 370 case TOP_PATCHLIST_32: 371 return numPrims * (mode - TOP_PATCHLIST_BASE); 372 373 case TOP_POLYGON: 374 case TOP_POINT_LIST_BF: 375 case TOP_LINE_STRIP_CONT: 376 case TOP_LINE_STRIP_BF: 377 case TOP_LINE_STRIP_CONT_BF: 378 case TOP_TRIANGLE_FAN_NOSTIPPLE: 379 case TOP_TRI_STRIP_REVERSE: 380 case TOP_PATCHLIST_BASE: 381 case TOP_UNKNOWN: 382 SWR_INVALID("Unsupported topology: %d", mode); 383 return 0; 384 } 385 386 return 0; 387 } 388 389 ////////////////////////////////////////////////////////////////////////// 390 /// @brief Return number of verts per primitive. 
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        // Patch control-point count is encoded relative to the base enum.
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    // Adjacency topologies carry extra neighbor verts per primitive.
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generate an execution mask from the remaining work.
/// @param numItemsRemaining - Number of items left; lanes beyond this (or
///        beyond KNOB_SIMD_WIDTH) are masked off.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

//////////////////////////////////////////////////////////////////////////
/// @brief 16-wide variant of GenerateMask (see above).
static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object providing the assembled prims.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the per-prim attributes are staged into
///        before the jitted streamout function consumes them.
/// @param streamIndex - which SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS, iterating
        // over the set bits of the stream mask (one bit per enabled slot).
        while (_BitScanForward(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is a dword offset relative to the start of the
            // vertex in pPrimData; each attribute occupies 4 floats.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                // Low half from vertex i; high half from vertex i+1 (if any).
                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how much work
///        items are remaining. If there is less than a SIMD's worth of work
///        then return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(
    uint32_t curIndex,
    uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}

//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    // Input packs 2 bits of stream ID per vertex; output packs 1 cut bit per
    // vertex, so it consumes input bytes two at a time.
    uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);

    // NOTE(review): when numInputBytes is odd the second read below touches
    // one byte past numInputBytes -- presumably the stream ID buffer is
    // allocated with slack; confirm at the allocation site.
    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        // Low nibble of the output byte: verts NOT belonging to this stream
        // get their cut bit set (1 = cut, i.e. excluded from assembly).
        uint8_t curInputByte = pStreamIdBase[2*b];
        uint8_t outByte = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        // High nibble from the next input byte.
        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}

// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;                      // assembled input vertices fed to the GS
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];    // per-lane GS output streams
    uint8_t* pGsTransposed;              // GS output transposed SOA->AOS for the PA
    void* pStreamCutBuffer;              // scratch cut buffer for multistream output
};

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices outputted by the GS
/// @param numAttribs - Number of attributes per vertex
template<typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4; 739 740 OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; 741 742 for (uint32_t i = 0; i < SimdWidth; ++i) 743 { 744 gatherOffsets[i] = srcVertexStride * i; 745 } 746 auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]); 747 748 uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; 749 uint32_t remainingVerts = numVerts; 750 751 for (uint32_t s = 0; s < numSimd; ++s) 752 { 753 uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; 754 uint8_t* pDstBase = pDst + s * dstVertexStride; 755 756 // Compute mask to prevent src overflow 757 uint32_t mask = std::min(remainingVerts, SimdWidth); 758 mask = GenMask(mask); 759 auto vMask = SIMD_T::vmask_ps(mask); 760 auto viMask = SIMD_T::castps_si(vMask); 761 762 for (uint32_t a = 0; a < numAttribs; ++a) 763 { 764 auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); 765 auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); 766 auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); 767 auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask); 768 769 SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); 770 SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY); 771 SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ); 772 SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW); 773 774 pSrcBase += 
sizeof(float) * 4; 775 pDstBase += sizeof(typename SIMD_T::Float) * 4; 776 } 777 remainingVerts -= SimdWidth; 778 } 779 } 780 781 782 ////////////////////////////////////////////////////////////////////////// 783 /// @brief Implements GS stage. 784 /// @param pDC - pointer to draw context. 785 /// @param workerId - thread's worker id. Even thread has a unique id. 786 /// @param pa - The primitive assembly object. 787 /// @param pGsOut - output stream for GS 788 template < 789 typename HasStreamOutT, 790 typename HasRastT> 791 static void GeometryShaderStage( 792 DRAW_CONTEXT *pDC, 793 uint32_t workerId, 794 PA_STATE& pa, 795 GsBuffers* pGsBuffers, 796 uint32_t* pSoPrimData, 797 #if USE_SIMD16_FRONTEND 798 uint32_t numPrims_simd8, 799 #endif 800 simdscalari const &primID) 801 { 802 SWR_CONTEXT *pContext = pDC->pContext; 803 804 AR_BEGIN(FEGeometryShader, pDC->drawId); 805 806 const API_STATE& state = GetApiState(pDC); 807 const SWR_GS_STATE* pState = &state.gsState; 808 SWR_GS_CONTEXT gsContext; 809 810 static uint8_t sNullBuffer[128] = { 0 }; 811 812 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) 813 { 814 gsContext.pStreams[i] = pGsBuffers->pGsOut[i]; 815 } 816 gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; 817 gsContext.PrimitiveID = primID; 818 819 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); 820 simdvector attrib[MAX_NUM_VERTS_PER_PRIM]; 821 822 // assemble all attributes for the input primitive 823 gsContext.inputVertStride = pState->inputVertStride; 824 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) 825 { 826 uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot; 827 uint32_t attribSlot = pState->vertexAttribOffset + slot; 828 pa.Assemble(srcAttribSlot, attrib); 829 830 for (uint32_t i = 0; i < numVertsPerPrim; ++i) 831 { 832 gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i]; 833 } 834 } 835 836 // assemble position 837 pa.Assemble(VERTEX_POSITION_SLOT, attrib); 838 for (uint32_t i = 0; i 
< numVertsPerPrim; ++i) 839 { 840 gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i]; 841 } 842 843 // record valid prims from the frontend to avoid over binning the newly generated 844 // prims from the GS 845 #if USE_SIMD16_FRONTEND 846 uint32_t numInputPrims = numPrims_simd8; 847 #else 848 uint32_t numInputPrims = pa.NumPrims(); 849 #endif 850 851 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) 852 { 853 gsContext.InstanceID = instance; 854 gsContext.mask = GenerateMask(numInputPrims); 855 856 // execute the geometry shader 857 state.pfnGsFunc(GetPrivateState(pDC), &gsContext); 858 859 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) 860 { 861 gsContext.pStreams[i] += pState->allocationSize; 862 } 863 } 864 865 // set up new binner and state for the GS output topology 866 #if USE_SIMD16_FRONTEND 867 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr; 868 if (HasRastT::value) 869 { 870 switch (pState->outputTopology) 871 { 872 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break; 873 case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break; 874 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break; 875 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); 876 } 877 } 878 879 #else 880 PFN_PROCESS_PRIMS pfnClipFunc = nullptr; 881 if (HasRastT::value) 882 { 883 switch (pState->outputTopology) 884 { 885 case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; 886 case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; 887 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; 888 default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); 889 } 890 } 891 892 #endif 893 // foreach input prim: 894 // - setup a new PA based on the emitted verts for that prim 895 // - loop over the new verts, calling PA to assemble each prim 896 uint32_t* pPrimitiveId = (uint32_t*)&primID; 897 898 uint32_t totalPrimsGenerated = 0; 899 for (uint32_t inputPrim = 0; 
inputPrim < numInputPrims; ++inputPrim) 900 { 901 uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim]; 902 903 // Vertex count is either emitted by shader or static 904 uint32_t vertexCount = 0; 905 if (pState->staticVertexCount) 906 { 907 vertexCount = pState->staticVertexCount; 908 } 909 else 910 { 911 // If emitted in shader, it should be the stored in the first dword of the output buffer 912 vertexCount = *(uint32_t*)pInstanceBase; 913 } 914 915 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) 916 { 917 uint32_t numEmittedVerts = vertexCount; 918 if (numEmittedVerts == 0) 919 { 920 continue; 921 } 922 923 uint8_t* pBase = pInstanceBase + instance * pState->allocationSize; 924 uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset; 925 uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset; 926 927 #if USE_SIMD16_FRONTEND 928 TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize); 929 #else 930 TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize); 931 #endif 932 933 uint32_t numAttribs = state.feNumAttributes; 934 935 for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) 936 { 937 bool processCutVerts = false; 938 uint8_t* pCutBuffer = pCutBase; 939 940 // assign default stream ID, only relevant when GS is outputting a single stream 941 uint32_t streamID = 0; 942 if (pState->isSingleStream) 943 { 944 processCutVerts = true; 945 streamID = pState->singleStreamID; 946 if (streamID != stream) continue; 947 } 948 else 949 { 950 // early exit if this stream is not enabled for streamout 951 if (HasStreamOutT::value && !state.soState.streamEnable[stream]) 952 { 953 continue; 954 } 955 956 // multi-stream output, need to translate StreamID buffer to a cut buffer 957 ProcessStreamIdBuffer(stream, pCutBase, 
numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer); 958 pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer; 959 processCutVerts = false; 960 } 961 962 #if USE_SIMD16_FRONTEND 963 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); 964 965 #else 966 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); 967 968 #endif 969 while (gsPa.GetNextStreamOutput()) 970 { 971 do 972 { 973 #if USE_SIMD16_FRONTEND 974 simd16vector attrib_simd16[3]; 975 976 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16); 977 978 #else 979 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); 980 981 #endif 982 if (assemble) 983 { 984 totalPrimsGenerated += gsPa.NumPrims(); 985 986 if (HasStreamOutT::value) 987 { 988 #if ENABLE_AVX512_SIMD16 989 gsPa.useAlternateOffset = false; 990 #endif 991 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream); 992 } 993 994 if (HasRastT::value && state.soState.streamToRasterizer == stream) 995 { 996 #if USE_SIMD16_FRONTEND 997 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]); 998 999 // Gather data from the SVG if provided. 
1000 simd16scalari vViewportIdx = SIMD16::setzero_si(); 1001 simd16scalari vRtIdx = SIMD16::setzero_si(); 1002 SIMD16::Vec4 svgAttrib[4]; 1003 1004 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) 1005 { 1006 gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); 1007 } 1008 1009 1010 if (state.backendState.readViewportArrayIndex) 1011 { 1012 vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); 1013 gsPa.viewportArrayActive = true; 1014 } 1015 if (state.backendState.readRenderTargetArrayIndex) 1016 { 1017 vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); 1018 gsPa.rtArrayActive = true; 1019 } 1020 1021 { 1022 // OOB VPAI indices => forced to zero. 1023 vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si()); 1024 simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); 1025 simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports); 1026 vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx); 1027 1028 gsPa.useAlternateOffset = false; 1029 pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx); 1030 } 1031 #else 1032 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); 1033 1034 // Gather data from the SVG if provided. 1035 simdscalari vViewportIdx = SIMD16::setzero_si(); 1036 simdscalari vRtIdx = SIMD16::setzero_si(); 1037 SIMD8::Vec4 svgAttrib[4]; 1038 1039 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) 1040 { 1041 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); 1042 } 1043 1044 1045 if (state.backendState.readViewportArrayIndex) 1046 { 1047 vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); 1048 1049 // OOB VPAI indices => forced to zero. 
1050 vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si()); 1051 simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); 1052 simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports); 1053 vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx); 1054 tessPa.viewportArrayActive = true; 1055 } 1056 if (state.backendState.readRenderTargetArrayIndex) 1057 { 1058 vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); 1059 tessPa.rtArrayActive = true; 1060 } 1061 1062 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx); 1063 #endif 1064 } 1065 } 1066 } while (gsPa.NextPrim()); 1067 } 1068 } 1069 } 1070 } 1071 1072 // update GS pipeline stats 1073 UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount); 1074 UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated); 1075 AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims)); 1076 AR_END(FEGeometryShader, 1); 1077 } 1078 1079 ////////////////////////////////////////////////////////////////////////// 1080 /// @brief Allocate GS buffers 1081 /// @param pDC - pointer to draw context. 
1082 /// @param state - API state 1083 /// @param ppGsOut - pointer to GS output buffer allocation 1084 /// @param ppCutBuffer - pointer to GS output cut buffer allocation 1085 template<typename SIMD_T, uint32_t SIMD_WIDTH> 1086 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers) 1087 { 1088 auto pArena = pDC->pArena; 1089 SWR_ASSERT(pArena != nullptr); 1090 SWR_ASSERT(state.gsState.gsEnable); 1091 1092 const SWR_GS_STATE& gsState = state.gsState; 1093 1094 // Allocate storage for vertex inputs 1095 uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim; 1096 pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32); 1097 1098 // Allocate arena space to hold GS output verts 1099 const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize; 1100 1101 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) 1102 { 1103 pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32); 1104 } 1105 1106 // Allocate storage for transposed GS output 1107 uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH; 1108 uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4); 1109 pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32); 1110 1111 // Allocate storage to hold temporary stream->cut buffer, if necessary 1112 if (state.gsState.isSingleStream) 1113 { 1114 pGsBuffers->pStreamCutBuffer = nullptr; 1115 } 1116 else 1117 { 1118 pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32); 1119 } 1120 } 1121 1122 ////////////////////////////////////////////////////////////////////////// 1123 /// @brief Contains all data generated by the HS and passed to the 1124 /// tessellator and DS. 
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;                   // HS execution context, reused across draws
    ScalarPatch patchData[KNOB_SIMD_WIDTH];     // HS control-point output, one patch per SIMD lane
    void* pTxCtx;                               // tessellator context storage (lazily allocated)
    size_t tsCtxSize;                           // size of the tessellator context storage

    simdscalar* pDSOutput;                      // DS output buffer (grown on demand, 64B aligned)
    size_t dsOutputAllocSize;                   // current capacity of pDSOutput in bytes
};

// Per-thread tessellation scratch storage; allocated on first use by
// AllocateTessellationData and reused for the lifetime of the worker thread.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
INLINE
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        // 64-byte alignment keeps the SIMD members cache-line aligned;
        // zero-init leaves pTxCtx/pDSOutput null so they are allocated lazily.
        gt_pTessellationThreadData = (TessellationThreadLocalData*)
            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
1157 /// @param pGsOut - output stream for GS 1158 template < 1159 typename HasGeometryShaderT, 1160 typename HasStreamOutT, 1161 typename HasRastT> 1162 static void TessellationStages( 1163 DRAW_CONTEXT *pDC, 1164 uint32_t workerId, 1165 PA_STATE& pa, 1166 GsBuffers* pGsBuffers, 1167 uint32_t* pSoPrimData, 1168 #if USE_SIMD16_FRONTEND 1169 uint32_t numPrims_simd8, 1170 #endif 1171 simdscalari const &primID) 1172 { 1173 SWR_CONTEXT *pContext = pDC->pContext; 1174 const API_STATE& state = GetApiState(pDC); 1175 const SWR_TS_STATE& tsState = state.tsState; 1176 1177 SWR_ASSERT(gt_pTessellationThreadData); 1178 1179 HANDLE tsCtx = TSInitCtx( 1180 tsState.domain, 1181 tsState.partitioning, 1182 tsState.tsOutputTopology, 1183 gt_pTessellationThreadData->pTxCtx, 1184 gt_pTessellationThreadData->tsCtxSize); 1185 if (tsCtx == nullptr) 1186 { 1187 gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64); 1188 tsCtx = TSInitCtx( 1189 tsState.domain, 1190 tsState.partitioning, 1191 tsState.tsOutputTopology, 1192 gt_pTessellationThreadData->pTxCtx, 1193 gt_pTessellationThreadData->tsCtxSize); 1194 } 1195 SWR_ASSERT(tsCtx); 1196 1197 #if USE_SIMD16_FRONTEND 1198 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr; 1199 if (HasRastT::value) 1200 { 1201 switch (tsState.postDSTopology) 1202 { 1203 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break; 1204 case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break; 1205 case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break; 1206 default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); 1207 } 1208 } 1209 1210 #else 1211 PFN_PROCESS_PRIMS pfnClipFunc = nullptr; 1212 if (HasRastT::value) 1213 { 1214 switch (tsState.postDSTopology) 1215 { 1216 case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break; 1217 case TOP_LINE_LIST: pfnClipFunc = ClipLines; break; 1218 case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; 1219 default: SWR_INVALID("Unexpected DS 
output topology: %d", tsState.postDSTopology); 1220 } 1221 } 1222 1223 #endif 1224 SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; 1225 hsContext.pCPout = gt_pTessellationThreadData->patchData; 1226 hsContext.PrimitiveID = primID; 1227 1228 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); 1229 // Max storage for one attribute for an entire simdprimitive 1230 simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; 1231 1232 // assemble all attributes for the input primitives 1233 for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) 1234 { 1235 uint32_t attribSlot = tsState.vertexAttribOffset + slot; 1236 pa.Assemble(attribSlot, simdattrib); 1237 1238 for (uint32_t i = 0; i < numVertsPerPrim; ++i) 1239 { 1240 hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i]; 1241 } 1242 } 1243 1244 #if defined(_DEBUG) 1245 memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); 1246 #endif 1247 1248 #if USE_SIMD16_FRONTEND 1249 uint32_t numPrims = numPrims_simd8; 1250 #else 1251 uint32_t numPrims = pa.NumPrims(); 1252 #endif 1253 hsContext.mask = GenerateMask(numPrims); 1254 1255 // Run the HS 1256 AR_BEGIN(FEHullShader, pDC->drawId); 1257 state.pfnHsFunc(GetPrivateState(pDC), &hsContext); 1258 AR_END(FEHullShader, 0); 1259 1260 UPDATE_STAT_FE(HsInvocations, numPrims); 1261 1262 const uint32_t* pPrimId = (const uint32_t*)&primID; 1263 1264 for (uint32_t p = 0; p < numPrims; ++p) 1265 { 1266 // Run Tessellator 1267 SWR_TS_TESSELLATED_DATA tsData = { 0 }; 1268 AR_BEGIN(FETessellation, pDC->drawId); 1269 TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); 1270 AR_EVENT(TessPrimCount(1)); 1271 AR_END(FETessellation, 0); 1272 1273 if (tsData.NumPrimitives == 0) 1274 { 1275 continue; 1276 } 1277 SWR_ASSERT(tsData.NumDomainPoints); 1278 1279 // Allocate DS Output memory 1280 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; 1281 #if 
USE_SIMD16_FRONTEND 1282 size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize; // simd8 -> simd16, padding 1283 #else 1284 size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize; 1285 size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; 1286 #endif 1287 if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize) 1288 { 1289 AlignedFree(gt_pTessellationThreadData->pDSOutput); 1290 gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64); 1291 gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize; 1292 } 1293 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); 1294 SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize); 1295 1296 #if defined(_DEBUG) 1297 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize); 1298 #endif 1299 1300 // Run Domain Shader 1301 SWR_DS_CONTEXT dsContext; 1302 dsContext.PrimitiveID = pPrimId[p]; 1303 dsContext.pCpIn = &hsContext.pCPout[p]; 1304 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; 1305 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; 1306 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; 1307 dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset; 1308 #if USE_SIMD16_FRONTEND 1309 dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16 1310 #else 1311 dsContext.vectorStride = requiredDSVectorInvocations; 1312 #endif 1313 1314 uint32_t dsInvocations = 0; 1315 1316 for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) 1317 { 1318 dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations); 1319 1320 AR_BEGIN(FEDomainShader, pDC->drawId); 1321 state.pfnDsFunc(GetPrivateState(pDC), &dsContext); 1322 AR_END(FEDomainShader, 0); 1323 1324 dsInvocations += KNOB_SIMD_WIDTH; 1325 } 1326 
UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints); 1327 1328 #if USE_SIMD16_FRONTEND 1329 SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16 1330 1331 #endif 1332 PA_TESS tessPa( 1333 pDC, 1334 #if USE_SIMD16_FRONTEND 1335 reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16 1336 dsContext.vectorStride / 2, // simd8 -> simd16 1337 #else 1338 dsContext.pOutputData, 1339 dsContext.vectorStride, 1340 #endif 1341 SWR_VTX_NUM_SLOTS, 1342 tsState.numDsOutputAttribs, 1343 tsData.ppIndices, 1344 tsData.NumPrimitives, 1345 tsState.postDSTopology, 1346 numVertsPerPrim); 1347 1348 while (tessPa.HasWork()) 1349 { 1350 #if USE_SIMD16_FRONTEND 1351 const uint32_t numPrims = tessPa.NumPrims(); 1352 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); 1353 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; 1354 1355 const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID); 1356 const simdscalari primID_lo = _simd16_extract_si(primID, 0); 1357 const simdscalari primID_hi = _simd16_extract_si(primID, 1); 1358 1359 #endif 1360 if (HasGeometryShaderT::value) 1361 { 1362 #if USE_SIMD16_FRONTEND 1363 tessPa.useAlternateOffset = false; 1364 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo); 1365 1366 if (numPrims_hi) 1367 { 1368 tessPa.useAlternateOffset = true; 1369 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi); 1370 } 1371 #else 1372 GeometryShaderStage<HasStreamOutT, HasRastT>( 1373 pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID)); 1374 #endif 1375 } 1376 else 1377 { 1378 if (HasStreamOutT::value) 1379 { 1380 #if ENABLE_AVX512_SIMD16 1381 tessPa.useAlternateOffset = false; 1382 #endif 1383 StreamOut(pDC, tessPa, workerId, pSoPrimData, 0); 1384 } 1385 1386 if (HasRastT::value) 1387 { 1388 
#if USE_SIMD16_FRONTEND 1389 simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points 1390 #else 1391 simdvector prim[3]; // Only deal with triangles, lines, or points 1392 #endif 1393 AR_BEGIN(FEPAAssemble, pDC->drawId); 1394 bool assemble = 1395 #if USE_SIMD16_FRONTEND 1396 tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16); 1397 #else 1398 tessPa.Assemble(VERTEX_POSITION_SLOT, prim); 1399 #endif 1400 AR_END(FEPAAssemble, 1); 1401 SWR_ASSERT(assemble); 1402 1403 SWR_ASSERT(pfnClipFunc); 1404 #if USE_SIMD16_FRONTEND 1405 // Gather data from the SVG if provided. 1406 simd16scalari vViewportIdx = SIMD16::setzero_si(); 1407 simd16scalari vRtIdx = SIMD16::setzero_si(); 1408 SIMD16::Vec4 svgAttrib[4]; 1409 1410 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) 1411 { 1412 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); 1413 } 1414 1415 1416 if (state.backendState.readViewportArrayIndex) 1417 { 1418 vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); 1419 tessPa.viewportArrayActive = true; 1420 } 1421 if (state.backendState.readRenderTargetArrayIndex) 1422 { 1423 vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); 1424 tessPa.rtArrayActive = true; 1425 } 1426 1427 1428 { 1429 // OOB VPAI indices => forced to zero. 1430 vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si()); 1431 simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); 1432 simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports); 1433 vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx); 1434 1435 tessPa.useAlternateOffset = false; 1436 pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx); 1437 } 1438 #else 1439 // Gather data from the SVG if provided. 
1440 simdscalari vViewportIdx = SIMD16::setzero_si(); 1441 simdscalari vRtIdx = SIMD16::setzero_si(); 1442 SIMD8::Vec4 svgAttrib[4]; 1443 1444 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) 1445 { 1446 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); 1447 } 1448 1449 if (state.backendState.readViewportArrayIndex) 1450 { 1451 vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); 1452 1453 // OOB VPAI indices => forced to zero. 1454 vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si()); 1455 simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); 1456 simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports); 1457 vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx); 1458 tessPa.viewportArrayActive = true; 1459 } 1460 if (state.backendState.readRenderTargetArrayIndex) 1461 { 1462 vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); 1463 tessPa.rtArrayActive = true; 1464 } 1465 pfnClipFunc(pDC, tessPa, workerId, prim, 1466 GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx); 1467 #endif 1468 } 1469 } 1470 1471 tessPa.NextPrim(); 1472 1473 } // while (tessPa.HasWork()) 1474 } // for (uint32_t p = 0; p < numPrims; ++p) 1475 1476 #if USE_SIMD16_FRONTEND 1477 if (gt_pTessellationThreadData->pDSOutput != nullptr) 1478 { 1479 AlignedFree(gt_pTessellationThreadData->pDSOutput); 1480 gt_pTessellationThreadData->pDSOutput = nullptr; 1481 } 1482 gt_pTessellationThreadData->dsOutputAllocSize = 0; 1483 1484 #endif 1485 TSDestroyCtx(tsCtx); 1486 } 1487 1488 THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr; 1489 THREAD uint32_t gVertexStoreSize = 0; 1490 1491 ////////////////////////////////////////////////////////////////////////// 1492 /// @brief FE handler for SwrDraw. 
1493 /// @tparam IsIndexedT - Is indexed drawing enabled 1494 /// @tparam HasTessellationT - Is tessellation enabled 1495 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled 1496 /// @tparam HasStreamOutT - Is stream-out enabled 1497 /// @tparam HasRastT - Is rasterization enabled 1498 /// @param pContext - pointer to SWR context. 1499 /// @param pDC - pointer to draw context. 1500 /// @param workerId - thread's worker id. 1501 /// @param pUserData - Pointer to DRAW_WORK 1502 template < 1503 typename IsIndexedT, 1504 typename IsCutIndexEnabledT, 1505 typename HasTessellationT, 1506 typename HasGeometryShaderT, 1507 typename HasStreamOutT, 1508 typename HasRastT> 1509 void ProcessDraw( 1510 SWR_CONTEXT *pContext, 1511 DRAW_CONTEXT *pDC, 1512 uint32_t workerId, 1513 void *pUserData) 1514 { 1515 1516 #if KNOB_ENABLE_TOSS_POINTS 1517 if (KNOB_TOSS_QUEUE_FE) 1518 { 1519 return; 1520 } 1521 #endif 1522 1523 AR_BEGIN(FEProcessDraw, pDC->drawId); 1524 1525 DRAW_WORK& work = *(DRAW_WORK*)pUserData; 1526 const API_STATE& state = GetApiState(pDC); 1527 1528 uint32_t indexSize = 0; 1529 uint32_t endVertex = work.numVerts; 1530 1531 const int32_t* pLastRequestedIndex = nullptr; 1532 if (IsIndexedT::value) 1533 { 1534 switch (work.type) 1535 { 1536 case R32_UINT: 1537 indexSize = sizeof(uint32_t); 1538 pLastRequestedIndex = &(work.pIB[endVertex]); 1539 break; 1540 case R16_UINT: 1541 indexSize = sizeof(uint16_t); 1542 // nasty address offset to last index 1543 pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex])); 1544 break; 1545 case R8_UINT: 1546 indexSize = sizeof(uint8_t); 1547 // nasty address offset to last index 1548 pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex])); 1549 break; 1550 default: 1551 SWR_INVALID("Invalid work.type: %d", work.type); 1552 } 1553 } 1554 else 1555 { 1556 // No cuts, prune partial primitives. 
1557 endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts)); 1558 } 1559 1560 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR) 1561 uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); 1562 #endif 1563 1564 GsBuffers gsBuffers; 1565 if (HasGeometryShaderT::value) 1566 { 1567 #if USE_SIMD16_FRONTEND 1568 AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); 1569 #else 1570 AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); 1571 #endif 1572 } 1573 1574 if (HasTessellationT::value) 1575 { 1576 SWR_ASSERT(state.tsState.tsEnable == true); 1577 SWR_ASSERT(state.pfnHsFunc != nullptr); 1578 SWR_ASSERT(state.pfnDsFunc != nullptr); 1579 1580 AllocateTessellationData(pContext); 1581 } 1582 else 1583 { 1584 SWR_ASSERT(state.tsState.tsEnable == false); 1585 SWR_ASSERT(state.pfnHsFunc == nullptr); 1586 SWR_ASSERT(state.pfnDsFunc == nullptr); 1587 } 1588 1589 // allocate space for streamout input prim data 1590 uint32_t* pSoPrimData = nullptr; 1591 if (HasStreamOutT::value) 1592 { 1593 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16); 1594 } 1595 1596 const uint32_t vertexCount = NumVertsPerPrim(state.topology, true); 1597 #if USE_SIMD16_FRONTEND 1598 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector); 1599 #else 1600 uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector); 1601 #endif 1602 1603 SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM); 1604 1605 // Compute storage requirements for vertex store 1606 // TODO: allocation needs to be rethought for better cut support 1607 uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine 1608 uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes; 1609 1610 // grow the vertex store for the PA as necessary 1611 if (gVertexStoreSize < vertexStoreSize) 1612 { 1613 if 
(gpVertexStore != nullptr) 1614 { 1615 AlignedFree(gpVertexStore); 1616 gpVertexStore = nullptr; 1617 } 1618 1619 SWR_ASSERT(gpVertexStore == nullptr); 1620 1621 gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64)); 1622 gVertexStoreSize = vertexStoreSize; 1623 1624 SWR_ASSERT(gpVertexStore != nullptr); 1625 } 1626 1627 // choose primitive assembler 1628 1629 PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1)); 1630 PA_STATE& pa = paFactory.GetPA(); 1631 1632 #if USE_SIMD16_FRONTEND 1633 #if USE_SIMD16_SHADERS 1634 simd16vertex vin; 1635 #else 1636 simdvertex vin_lo; 1637 simdvertex vin_hi; 1638 #endif 1639 SWR_VS_CONTEXT vsContext_lo; 1640 SWR_VS_CONTEXT vsContext_hi; 1641 1642 #if USE_SIMD16_SHADERS 1643 vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin); 1644 vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin); 1645 #else 1646 vsContext_lo.pVin = &vin_lo; 1647 vsContext_hi.pVin = &vin_hi; 1648 #endif 1649 vsContext_lo.AlternateOffset = 0; 1650 vsContext_hi.AlternateOffset = 1; 1651 1652 SWR_FETCH_CONTEXT fetchInfo_lo = { 0 }; 1653 1654 fetchInfo_lo.pStreams = &state.vertexBuffers[0]; 1655 fetchInfo_lo.StartInstance = work.startInstance; 1656 fetchInfo_lo.StartVertex = 0; 1657 1658 if (IsIndexedT::value) 1659 { 1660 fetchInfo_lo.BaseVertex = work.baseVertex; 1661 1662 // if the entire index buffer isn't being consumed, set the last index 1663 // so that fetches < a SIMD wide will be masked off 1664 fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); 1665 if (pLastRequestedIndex < fetchInfo_lo.pLastIndex) 1666 { 1667 fetchInfo_lo.pLastIndex = pLastRequestedIndex; 1668 } 1669 } 1670 else 1671 { 1672 fetchInfo_lo.StartVertex = work.startVertex; 1673 } 1674 1675 SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo; 1676 1677 const 
simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); 1678 1679 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) 1680 { 1681 uint32_t i = 0; 1682 1683 simd16scalari vIndex; 1684 1685 if (IsIndexedT::value) 1686 { 1687 fetchInfo_lo.pIndices = work.pIB; 1688 fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH 1689 } 1690 else 1691 { 1692 vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale); 1693 1694 fetchInfo_lo.pIndices = (const int32_t *)&vIndex; 1695 fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH 1696 } 1697 1698 fetchInfo_lo.CurInstance = instanceNum; 1699 fetchInfo_hi.CurInstance = instanceNum; 1700 1701 vsContext_lo.InstanceID = instanceNum; 1702 vsContext_hi.InstanceID = instanceNum; 1703 1704 while (pa.HasWork()) 1705 { 1706 // GetNextVsOutput currently has the side effect of updating some PA state machine state. 1707 // So we need to keep this outside of (i < endVertex) check. 
1708 1709 simdmask *pvCutIndices_lo = nullptr; 1710 simdmask *pvCutIndices_hi = nullptr; 1711 1712 if (IsIndexedT::value) 1713 { 1714 // simd16mask <=> simdmask[2] 1715 1716 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0]; 1717 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1]; 1718 } 1719 1720 simd16vertex &vout = pa.GetNextVsOutput(); 1721 1722 vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout); 1723 vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout); 1724 1725 if (i < endVertex) 1726 { 1727 if (!IsIndexedT::value) 1728 { 1729 fetchInfo_lo.pLastIndex = fetchInfo_lo.pIndices; 1730 uint32_t offset; 1731 offset = std::min(endVertex-i, (uint32_t) KNOB_SIMD16_WIDTH); 1732 #if USE_SIMD16_SHADERS 1733 fetchInfo_lo.pLastIndex += offset; 1734 #else 1735 fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH); 1736 uint32_t offset2 = std::min(offset, (uint32_t) KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH; 1737 assert(offset >= 0); 1738 fetchInfo_hi.pLastIndex = fetchInfo_hi.pIndices; 1739 fetchInfo_hi.pLastIndex += offset2; 1740 #endif 1741 } 1742 // 1. Execute FS/VS for a single SIMD. 
1743 AR_BEGIN(FEFetchShader, pDC->drawId); 1744 #if USE_SIMD16_SHADERS 1745 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin); 1746 #else 1747 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin_lo); 1748 1749 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH 1750 { 1751 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi); 1752 } 1753 #endif 1754 AR_END(FEFetchShader, 0); 1755 1756 // forward fetch generated vertex IDs to the vertex shader 1757 #if USE_SIMD16_SHADERS 1758 #if USE_SIMD16_VS 1759 vsContext_lo.VertexID16 = _simd16_insert_si( 1760 vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0); 1761 vsContext_lo.VertexID16 = _simd16_insert_si( 1762 vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1); 1763 #else 1764 vsContext_lo.VertexID = fetchInfo_lo.VertexID; 1765 vsContext_hi.VertexID = fetchInfo_lo.VertexID2; 1766 #endif 1767 #else 1768 vsContext_lo.VertexID = fetchInfo_lo.VertexID; 1769 vsContext_hi.VertexID = fetchInfo_hi.VertexID; 1770 #endif 1771 1772 // Setup active mask for vertex shader. 
1773 #if USE_SIMD16_VS 1774 vsContext_lo.mask16 = GenerateMask16(endVertex - i); 1775 #else 1776 vsContext_lo.mask = GenerateMask(endVertex - i); 1777 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH)); 1778 #endif 1779 1780 // forward cut mask to the PA 1781 if (IsIndexedT::value) 1782 { 1783 #if USE_SIMD16_SHADERS 1784 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); 1785 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2)); 1786 #else 1787 *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); 1788 *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask)); 1789 #endif 1790 } 1791 1792 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); 1793 1794 #if KNOB_ENABLE_TOSS_POINTS 1795 if (!KNOB_TOSS_FETCH) 1796 #endif 1797 { 1798 AR_BEGIN(FEVertexShader, pDC->drawId); 1799 #if USE_SIMD16_VS 1800 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo); 1801 #else 1802 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo); 1803 1804 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH 1805 { 1806 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi); 1807 } 1808 #endif 1809 AR_END(FEVertexShader, 0); 1810 1811 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); 1812 } 1813 } 1814 1815 // 2. Assemble primitives given the last two SIMD. 
1816 do 1817 { 1818 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM]; 1819 1820 RDTSC_START(FEPAAssemble); 1821 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16); 1822 RDTSC_STOP(FEPAAssemble, 1, 0); 1823 1824 #if KNOB_ENABLE_TOSS_POINTS 1825 if (!KNOB_TOSS_FETCH) 1826 #endif 1827 { 1828 #if KNOB_ENABLE_TOSS_POINTS 1829 if (!KNOB_TOSS_VS) 1830 #endif 1831 { 1832 if (assemble) 1833 { 1834 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); 1835 1836 const uint32_t numPrims = pa.NumPrims(); 1837 const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); 1838 const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; 1839 1840 const simd16scalari primID = pa.GetPrimID(work.startPrimID); 1841 const simdscalari primID_lo = _simd16_extract_si(primID, 0); 1842 const simdscalari primID_hi = _simd16_extract_si(primID, 1); 1843 1844 if (HasTessellationT::value) 1845 { 1846 pa.useAlternateOffset = false; 1847 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo); 1848 1849 if (numPrims_hi) 1850 { 1851 pa.useAlternateOffset = true; 1852 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi); 1853 } 1854 } 1855 else if (HasGeometryShaderT::value) 1856 { 1857 pa.useAlternateOffset = false; 1858 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo); 1859 1860 if (numPrims_hi) 1861 { 1862 pa.useAlternateOffset = true; 1863 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi); 1864 } 1865 } 1866 else 1867 { 1868 // If streamout is enabled then stream vertices out to memory. 
1869 if (HasStreamOutT::value) 1870 { 1871 pa.useAlternateOffset = false; 1872 StreamOut(pDC, pa, workerId, pSoPrimData, 0); 1873 } 1874 1875 if (HasRastT::value) 1876 { 1877 SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16); 1878 // Gather data from the SVG if provided. 1879 simd16scalari vpai = SIMD16::setzero_si(); 1880 simd16scalari rtai = SIMD16::setzero_si(); 1881 SIMD16::Vec4 svgAttrib[4]; 1882 1883 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) 1884 { 1885 pa.Assemble(VERTEX_SGV_SLOT, svgAttrib); 1886 } 1887 1888 1889 if (state.backendState.readViewportArrayIndex) 1890 { 1891 vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); 1892 pa.viewportArrayActive = true; 1893 } 1894 if (state.backendState.readRenderTargetArrayIndex) 1895 { 1896 rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); 1897 pa.rtArrayActive = true; 1898 } 1899 1900 { 1901 // OOB VPAI indices => forced to zero. 1902 vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si()); 1903 simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); 1904 simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports); 1905 vpai = SIMD16::and_si(vClearMask, vpai); 1906 1907 pa.useAlternateOffset = false; 1908 pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai); 1909 } 1910 } 1911 } 1912 } 1913 } 1914 } 1915 } while (pa.NextPrim()); 1916 1917 if (IsIndexedT::value) 1918 { 1919 fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize); 1920 fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize); 1921 } 1922 else 1923 { 1924 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH)); 1925 } 1926 1927 i += KNOB_SIMD16_WIDTH; 1928 } 1929 1930 pa.Reset(); 1931 } 1932 1933 #else 1934 SWR_VS_CONTEXT vsContext; 1935 SWR_FETCH_CONTEXT fetchInfo = { 0 }; 1936 1937 
fetchInfo.pStreams = &state.vertexBuffers[0]; 1938 fetchInfo.StartInstance = work.startInstance; 1939 fetchInfo.StartVertex = 0; 1940 1941 if (IsIndexedT::value) 1942 { 1943 fetchInfo.BaseVertex = work.baseVertex; 1944 1945 // if the entire index buffer isn't being consumed, set the last index 1946 // so that fetches < a SIMD wide will be masked off 1947 fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); 1948 if (pLastRequestedIndex < fetchInfo.pLastIndex) 1949 { 1950 fetchInfo.pLastIndex = pLastRequestedIndex; 1951 } 1952 } 1953 else 1954 { 1955 fetchInfo.StartVertex = work.startVertex; 1956 } 1957 1958 const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); 1959 1960 /// @todo: temporarily move instance loop in the FE to ensure SO ordering 1961 for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) 1962 { 1963 simdscalari vIndex; 1964 uint32_t i = 0; 1965 1966 if (IsIndexedT::value) 1967 { 1968 fetchInfo.pIndices = work.pIB; 1969 } 1970 else 1971 { 1972 vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale); 1973 fetchInfo.pIndices = (const int32_t*)&vIndex; 1974 } 1975 1976 fetchInfo.CurInstance = instanceNum; 1977 vsContext.InstanceID = instanceNum; 1978 1979 while (pa.HasWork()) 1980 { 1981 // GetNextVsOutput currently has the side effect of updating some PA state machine state. 1982 // So we need to keep this outside of (i < endVertex) check. 1983 simdmask* pvCutIndices = nullptr; 1984 if (IsIndexedT::value) 1985 { 1986 pvCutIndices = &pa.GetNextVsIndices(); 1987 } 1988 1989 simdvertex& vout = pa.GetNextVsOutput(); 1990 vsContext.pVin = &vout; 1991 vsContext.pVout = &vout; 1992 1993 if (i < endVertex) 1994 { 1995 1996 // 1. Execute FS/VS for a single SIMD. 
1997 AR_BEGIN(FEFetchShader, pDC->drawId); 1998 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout); 1999 AR_END(FEFetchShader, 0); 2000 2001 // forward fetch generated vertex IDs to the vertex shader 2002 vsContext.VertexID = fetchInfo.VertexID; 2003 2004 // Setup active mask for vertex shader. 2005 vsContext.mask = GenerateMask(endVertex - i); 2006 2007 // forward cut mask to the PA 2008 if (IsIndexedT::value) 2009 { 2010 *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); 2011 } 2012 2013 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); 2014 2015 #if KNOB_ENABLE_TOSS_POINTS 2016 if (!KNOB_TOSS_FETCH) 2017 #endif 2018 { 2019 AR_BEGIN(FEVertexShader, pDC->drawId); 2020 state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); 2021 AR_END(FEVertexShader, 0); 2022 2023 UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); 2024 } 2025 } 2026 2027 // 2. Assemble primitives given the last two SIMD. 2028 do 2029 { 2030 simdvector prim[MAX_NUM_VERTS_PER_PRIM]; 2031 // PaAssemble returns false if there is not enough verts to assemble. 2032 AR_BEGIN(FEPAAssemble, pDC->drawId); 2033 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); 2034 AR_END(FEPAAssemble, 1); 2035 2036 #if KNOB_ENABLE_TOSS_POINTS 2037 if (!KNOB_TOSS_FETCH) 2038 #endif 2039 { 2040 #if KNOB_ENABLE_TOSS_POINTS 2041 if (!KNOB_TOSS_VS) 2042 #endif 2043 { 2044 if (assemble) 2045 { 2046 UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); 2047 2048 if (HasTessellationT::value) 2049 { 2050 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( 2051 pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID)); 2052 } 2053 else if (HasGeometryShaderT::value) 2054 { 2055 GeometryShaderStage<HasStreamOutT, HasRastT>( 2056 pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID)); 2057 } 2058 else 2059 { 2060 // If streamout is enabled then stream vertices out to memory. 
2061 if (HasStreamOutT::value) 2062 { 2063 StreamOut(pDC, pa, workerId, pSoPrimData, 0); 2064 } 2065 2066 if (HasRastT::value) 2067 { 2068 SWR_ASSERT(pDC->pState->pfnProcessPrims); 2069 2070 // Gather data from the SVG if provided. 2071 simdscalari vViewportIdx = SIMD16::setzero_si(); 2072 simdscalari vRtIdx = SIMD16::setzero_si(); 2073 SIMD8::Vec4 svgAttrib[4]; 2074 2075 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex) 2076 { 2077 tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib); 2078 } 2079 2080 if (state.backendState.readViewportArrayIndex) 2081 { 2082 vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]); 2083 2084 // OOB VPAI indices => forced to zero. 2085 vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si()); 2086 simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); 2087 simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports); 2088 vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx); 2089 tessPa.viewportArrayActive = true; 2090 } 2091 if (state.backendState.readRenderTargetArrayIndex) 2092 { 2093 vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]); 2094 tessPa.rtArrayActive = true; 2095 } 2096 2097 pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, 2098 GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx); 2099 } 2100 } 2101 } 2102 } 2103 } 2104 } while (pa.NextPrim()); 2105 2106 if (IsIndexedT::value) 2107 { 2108 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); 2109 } 2110 else 2111 { 2112 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); 2113 } 2114 2115 i += KNOB_SIMD_WIDTH; 2116 } 2117 pa.Reset(); 2118 } 2119 2120 #endif 2121 2122 AR_END(FEProcessDraw, numPrims * work.numInstances); 2123 } 2124 2125 struct FEDrawChooser 2126 { 2127 typedef PFN_FE_WORK_FUNC FuncType; 2128 2129 template <typename... 
ArgsB> 2130 static FuncType GetFunc() 2131 { 2132 return ProcessDraw<ArgsB...>; 2133 } 2134 }; 2135 2136 2137 // Selector for correct templated Draw front-end function 2138 PFN_FE_WORK_FUNC GetProcessDrawFunc( 2139 bool IsIndexed, 2140 bool IsCutIndexEnabled, 2141 bool HasTessellation, 2142 bool HasGeometryShader, 2143 bool HasStreamOut, 2144 bool HasRasterization) 2145 { 2146 return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization); 2147 } 2148