1 /**************************************************************************** 2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file api.cpp 24 * 25 * @brief API implementation 26 * 27 ******************************************************************************/ 28 29 #include <cfloat> 30 #include <cmath> 31 #include <cstdio> 32 #include <new> 33 34 #include "core/api.h" 35 #include "core/backend.h" 36 #include "core/context.h" 37 #include "core/depthstencil.h" 38 #include "core/frontend.h" 39 #include "core/rasterizer.h" 40 #include "core/rdtsc_core.h" 41 #include "core/threads.h" 42 #include "core/tilemgr.h" 43 #include "core/clip.h" 44 #include "core/utils.h" 45 46 #include "common/simdintrin.h" 47 #include "common/os.h" 48 49 static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y }; 50 51 void SetupDefaultState(SWR_CONTEXT *pContext); 52 53 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) 54 { 55 return (SWR_CONTEXT*)hContext; 56 } 57 58 void WakeAllThreads(SWR_CONTEXT *pContext) 59 { 60 pContext->FifosNotEmpty.notify_all(); 61 } 62 63 ////////////////////////////////////////////////////////////////////////// 64 /// @brief Create SWR Context. 65 /// @param pCreateInfo - pointer to creation info. 66 HANDLE SwrCreateContext( 67 SWR_CREATECONTEXT_INFO* pCreateInfo) 68 { 69 RDTSC_RESET(); 70 RDTSC_INIT(0); 71 72 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); 73 memset(pContextMem, 0, sizeof(SWR_CONTEXT)); 74 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); 75 76 pContext->privateStateSize = pCreateInfo->privateStateSize; 77 78 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); 79 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); 80 81 pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); 82 pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); 83 84 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) 85 { 86 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); 87 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); 88 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); 89 90 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); 91 } 92 93 pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; 94 pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; 95 pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; 96 pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; 97 pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; 98 99 if (pCreateInfo->pThreadInfo) 100 { 101 pContext->threadInfo = *pCreateInfo->pThreadInfo; 102 } 103 104 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); 105 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); 106 new (&pContext->WaitLock) std::mutex(); 107 new (&pContext->FifosNotEmpty) std::condition_variable(); 108 109 CreateThreadPool(pContext, &pContext->threadPool); 110 111 pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; 112 pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads]; 113 114 #if defined(KNOB_ENABLE_AR) 115 // Setup ArchRast thread contexts which includes +1 for API thread. 116 pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1]; 117 pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API); 118 #endif 119 120 // Allocate scratch space for workers. 121 ///@note We could lazily allocate this but its rather small amount of memory. 122 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) 123 { 124 #if defined(_WIN32) 125 uint32_t numaNode = pContext->threadPool.pThreadData ? 126 pContext->threadPool.pThreadData[i].numaId : 0; 127 pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma( 128 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE), 129 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, 130 numaNode); 131 #else 132 pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); 133 #endif 134 135 #if defined(KNOB_ENABLE_AR) 136 // Initialize worker thread context for ArchRast. 137 pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER); 138 #endif 139 } 140 141 // State setup AFTER context is fully initialized 142 SetupDefaultState(pContext); 143 144 // initialize hot tile manager 145 pContext->pHotTileMgr = new HotTileMgr(); 146 147 // initialize function pointer tables 148 InitClearTilesTable(); 149 150 // initialize callback functions 151 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; 152 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; 153 pContext->pfnClearTile = pCreateInfo->pfnClearTile; 154 pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; 155 pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; 156 pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; 157 158 159 // pass pointer to bucket manager back to caller 160 #ifdef KNOB_ENABLE_RDTSC 161 pCreateInfo->pBucketMgr = &gBucketMgr; 162 #endif 163 164 pCreateInfo->contextSaveSize = sizeof(API_STATE); 165 166 StartThreadPool(pContext, &pContext->threadPool); 167 168 return (HANDLE)pContext; 169 } 170 171 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) 172 { 173 memcpy(&dst.state, &src.state, sizeof(API_STATE)); 174 } 175 176 template<bool IsDraw> 177 void QueueWork(SWR_CONTEXT *pContext) 178 { 179 DRAW_CONTEXT* pDC = pContext->pCurDrawContext; 180 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; 181 182 if (IsDraw) 183 { 184 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex]; 185 pDC->pTileMgr->initialize(); 186 } 187 188 // Each worker thread looks at a DC for both FE and BE work at different times and so we 189 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers 190 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and 191 // then moved on if all work is done.) 192 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads; 193 194 if (IsDraw) 195 { 196 InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE); 197 } 198 199 _ReadWriteBarrier(); 200 { 201 std::unique_lock<std::mutex> lock(pContext->WaitLock); 202 pContext->dcRing.Enqueue(); 203 } 204 205 if (pContext->threadInfo.SINGLE_THREADED) 206 { 207 // flush denormals to 0 208 uint32_t mxcsr = _mm_getcsr(); 209 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); 210 211 if (IsDraw) 212 { 213 uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; 214 WorkOnFifoFE(pContext, 0, curDraw[0]); 215 WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0); 216 } 217 else 218 { 219 uint32_t curDispatch = pContext->pCurDrawContext->drawId; 220 WorkOnCompute(pContext, 0, curDispatch); 221 } 222 223 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers). 224 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {} 225 226 // restore csr 227 _mm_setcsr(mxcsr); 228 } 229 else 230 { 231 AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId); 232 WakeAllThreads(pContext); 233 AR_API_END(APIDrawWakeAllThreads, 1); 234 } 235 236 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. 237 pContext->pPrevDrawContext = pContext->pCurDrawContext; 238 pContext->pCurDrawContext = nullptr; 239 } 240 241 INLINE void QueueDraw(SWR_CONTEXT* pContext) 242 { 243 QueueWork<true>(pContext); 244 } 245 246 INLINE void QueueDispatch(SWR_CONTEXT* pContext) 247 { 248 QueueWork<false>(pContext); 249 } 250 251 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) 252 { 253 AR_API_BEGIN(APIGetDrawContext, 0); 254 // If current draw context is null then need to obtain a new draw context to use from ring. 255 if (pContext->pCurDrawContext == nullptr) 256 { 257 // Need to wait for a free entry. 258 while (pContext->dcRing.IsFull()) 259 { 260 _mm_pause(); 261 } 262 263 uint64_t curDraw = pContext->dcRing.GetHead(); 264 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; 265 266 if ((pContext->frameCount - pContext->lastFrameChecked) > 2 || 267 (curDraw - pContext->lastDrawChecked) > 0x10000) 268 { 269 // Take this opportunity to clean-up old arena allocations 270 pContext->cachingArenaAllocator.FreeOldBlocks(); 271 272 pContext->lastFrameChecked = pContext->frameCount; 273 pContext->lastDrawChecked = curDraw; 274 } 275 276 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; 277 pContext->pCurDrawContext = pCurDrawContext; 278 279 // Assign next available entry in DS ring to this DC. 280 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; 281 pCurDrawContext->pState = &pContext->dsRing[dsIndex]; 282 283 // Copy previous state to current state. 284 if (pContext->pPrevDrawContext) 285 { 286 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; 287 288 // If we're splitting our draw then we can just use the same state from the previous 289 // draw. In this case, we won't increment the DS ring index so the next non-split 290 // draw can receive the state. 291 if (isSplitDraw == false) 292 { 293 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); 294 295 // Should have been cleaned up previously 296 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); 297 298 pCurDrawContext->pState->pPrivateState = nullptr; 299 300 pContext->curStateId++; // Progress state ring index forward. 301 } 302 else 303 { 304 // If its a split draw then just copy the state pointer over 305 // since its the same draw. 306 pCurDrawContext->pState = pPrevDrawContext->pState; 307 SWR_ASSERT(pPrevDrawContext->cleanupState == false); 308 } 309 } 310 else 311 { 312 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); 313 pContext->curStateId++; // Progress state ring index forward. 314 } 315 316 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); 317 318 // Reset dependency 319 pCurDrawContext->dependent = false; 320 pCurDrawContext->dependentFE = false; 321 322 pCurDrawContext->pContext = pContext; 323 pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 324 325 pCurDrawContext->doneFE = false; 326 pCurDrawContext->FeLock = 0; 327 pCurDrawContext->threadsDone = 0; 328 pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr; 329 330 pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads); 331 332 // Assign unique drawId for this DC 333 pCurDrawContext->drawId = pContext->dcRing.GetHead(); 334 335 pCurDrawContext->cleanupState = true; 336 337 } 338 else 339 { 340 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); 341 } 342 343 AR_API_END(APIGetDrawContext, 0); 344 return pContext->pCurDrawContext; 345 } 346 347 API_STATE* GetDrawState(SWR_CONTEXT *pContext) 348 { 349 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 350 SWR_ASSERT(pDC->pState != nullptr); 351 352 return &pDC->pState->state; 353 } 354 355 void SwrDestroyContext(HANDLE hContext) 356 { 357 SWR_CONTEXT *pContext = GetContext(hContext); 358 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 359 360 pDC->FeWork.type = SHUTDOWN; 361 pDC->FeWork.pfnWork = ProcessShutdown; 362 363 //enqueue 364 QueueDraw(pContext); 365 366 DestroyThreadPool(pContext, &pContext->threadPool); 367 368 // free the fifos 369 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) 370 { 371 delete[] pContext->dcRing[i].dynState.pStats; 372 delete pContext->dcRing[i].pArena; 373 delete pContext->dsRing[i].pArena; 374 pContext->pMacroTileManagerArray[i].~MacroTileMgr(); 375 pContext->pDispatchQueueArray[i].~DispatchQueue(); 376 } 377 378 AlignedFree(pContext->pDispatchQueueArray); 379 AlignedFree(pContext->pMacroTileManagerArray); 380 381 // Free scratch space. 382 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) 383 { 384 #if defined(_WIN32) 385 VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE); 386 #else 387 AlignedFree(pContext->ppScratch[i]); 388 #endif 389 390 #if defined(KNOB_ENABLE_AR) 391 ArchRast::DestroyThreadContext(pContext->pArContext[i]); 392 #endif 393 } 394 395 delete[] pContext->ppScratch; 396 delete[] pContext->pStats; 397 398 delete(pContext->pHotTileMgr); 399 400 pContext->~SWR_CONTEXT(); 401 AlignedFree(GetContext(hContext)); 402 } 403 404 void SWR_API SwrSaveState( 405 HANDLE hContext, 406 void* pOutputStateBlock, 407 size_t memSize) 408 { 409 SWR_CONTEXT *pContext = GetContext(hContext); 410 auto pSrc = GetDrawState(pContext); 411 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc)); 412 413 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc)); 414 } 415 416 void SWR_API SwrRestoreState( 417 HANDLE hContext, 418 const void* pStateBlock, 419 size_t memSize) 420 { 421 SWR_CONTEXT *pContext = GetContext(hContext); 422 auto pDst = GetDrawState(pContext); 423 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst)); 424 425 memcpy(pDst, pStateBlock, sizeof(*pDst)); 426 } 427 428 void SetupDefaultState(SWR_CONTEXT *pContext) 429 { 430 API_STATE* pState = GetDrawState(pContext); 431 432 pState->rastState.cullMode = SWR_CULLMODE_NONE; 433 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; 434 435 pState->depthBoundsState.depthBoundsTestEnable = false; 436 pState->depthBoundsState.depthBoundsTestMinValue = 0.0f; 437 pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f; 438 } 439 440 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3) 441 { 442 SWR_ASSERT(pfnFunc != nullptr); 443 444 SWR_CONTEXT *pContext = GetContext(hContext); 445 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 446 447 AR_API_BEGIN(APISync, 0); 448 449 pDC->FeWork.type = SYNC; 450 pDC->FeWork.pfnWork = ProcessSync; 451 452 // Setup callback function 453 pDC->retireCallback.pfnCallbackFunc = pfnFunc; 454 pDC->retireCallback.userData = userData; 455 pDC->retireCallback.userData2 = userData2; 456 pDC->retireCallback.userData3 = userData3; 457 458 //enqueue 459 QueueDraw(pContext); 460 461 AR_API_END(APISync, 1); 462 } 463 464 void SwrWaitForIdle(HANDLE hContext) 465 { 466 SWR_CONTEXT *pContext = GetContext(hContext); 467 468 AR_API_BEGIN(APIWaitForIdle, 0); 469 470 while (!pContext->dcRing.IsEmpty()) 471 { 472 _mm_pause(); 473 } 474 475 AR_API_END(APIWaitForIdle, 1); 476 } 477 478 void SwrWaitForIdleFE(HANDLE hContext) 479 { 480 SWR_CONTEXT *pContext = GetContext(hContext); 481 482 AR_API_BEGIN(APIWaitForIdle, 0); 483 484 while (pContext->drawsOutstandingFE > 0) 485 { 486 _mm_pause(); 487 } 488 489 AR_API_END(APIWaitForIdle, 1); 490 } 491 492 void SwrSetVertexBuffers( 493 HANDLE hContext, 494 uint32_t numBuffers, 495 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) 496 { 497 API_STATE* pState = GetDrawState(GetContext(hContext)); 498 499 for (uint32_t i = 0; i < numBuffers; ++i) 500 { 501 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; 502 pState->vertexBuffers[pVB->index] = *pVB; 503 } 504 } 505 506 void SwrSetIndexBuffer( 507 HANDLE hContext, 508 const SWR_INDEX_BUFFER_STATE* pIndexBuffer) 509 { 510 API_STATE* pState = GetDrawState(GetContext(hContext)); 511 512 pState->indexBuffer = *pIndexBuffer; 513 } 514 515 void SwrSetFetchFunc( 516 HANDLE hContext, 517 PFN_FETCH_FUNC pfnFetchFunc) 518 { 519 API_STATE* pState = GetDrawState(GetContext(hContext)); 520 521 pState->pfnFetchFunc = pfnFetchFunc; 522 } 523 524 void SwrSetSoFunc( 525 HANDLE hContext, 526 PFN_SO_FUNC pfnSoFunc, 527 uint32_t streamIndex) 528 { 529 API_STATE* pState = GetDrawState(GetContext(hContext)); 530 531 SWR_ASSERT(streamIndex < MAX_SO_STREAMS); 532 533 pState->pfnSoFunc[streamIndex] = pfnSoFunc; 534 } 535 536 void SwrSetSoState( 537 HANDLE hContext, 538 SWR_STREAMOUT_STATE* pSoState) 539 { 540 API_STATE* pState = GetDrawState(GetContext(hContext)); 541 542 pState->soState = *pSoState; 543 } 544 545 void SwrSetSoBuffers( 546 HANDLE hContext, 547 SWR_STREAMOUT_BUFFER* pSoBuffer, 548 uint32_t slot) 549 { 550 API_STATE* pState = GetDrawState(GetContext(hContext)); 551 552 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); 553 554 pState->soBuffer[slot] = *pSoBuffer; 555 } 556 557 void SwrSetVertexFunc( 558 HANDLE hContext, 559 PFN_VERTEX_FUNC pfnVertexFunc) 560 { 561 API_STATE* pState = GetDrawState(GetContext(hContext)); 562 563 pState->pfnVertexFunc = pfnVertexFunc; 564 } 565 566 void SwrSetFrontendState( 567 HANDLE hContext, 568 SWR_FRONTEND_STATE *pFEState) 569 { 570 API_STATE* pState = GetDrawState(GetContext(hContext)); 571 pState->frontendState = *pFEState; 572 } 573 574 void SwrSetGsState( 575 HANDLE hContext, 576 SWR_GS_STATE *pGSState) 577 { 578 API_STATE* pState = GetDrawState(GetContext(hContext)); 579 pState->gsState = *pGSState; 580 } 581 582 void SwrSetGsFunc( 583 HANDLE hContext, 584 PFN_GS_FUNC pfnGsFunc) 585 { 586 API_STATE* pState = GetDrawState(GetContext(hContext)); 587 pState->pfnGsFunc = pfnGsFunc; 588 } 589 590 void SwrSetCsFunc( 591 HANDLE hContext, 592 PFN_CS_FUNC pfnCsFunc, 593 uint32_t totalThreadsInGroup, 594 uint32_t totalSpillFillSize) 595 { 596 API_STATE* pState = GetDrawState(GetContext(hContext)); 597 pState->pfnCsFunc = pfnCsFunc; 598 pState->totalThreadsInGroup = totalThreadsInGroup; 599 pState->totalSpillFillSize = totalSpillFillSize; 600 } 601 602 void SwrSetTsState( 603 HANDLE hContext, 604 SWR_TS_STATE *pState) 605 { 606 API_STATE* pApiState = GetDrawState(GetContext(hContext)); 607 pApiState->tsState = *pState; 608 } 609 610 void SwrSetHsFunc( 611 HANDLE hContext, 612 PFN_HS_FUNC pfnFunc) 613 { 614 API_STATE* pApiState = GetDrawState(GetContext(hContext)); 615 pApiState->pfnHsFunc = pfnFunc; 616 } 617 618 void SwrSetDsFunc( 619 HANDLE hContext, 620 PFN_DS_FUNC pfnFunc) 621 { 622 API_STATE* pApiState = GetDrawState(GetContext(hContext)); 623 pApiState->pfnDsFunc = pfnFunc; 624 } 625 626 void SwrSetDepthStencilState( 627 HANDLE hContext, 628 SWR_DEPTH_STENCIL_STATE *pDSState) 629 { 630 API_STATE* pState = GetDrawState(GetContext(hContext)); 631 632 pState->depthStencilState = *pDSState; 633 } 634 635 void SwrSetBackendState( 636 HANDLE hContext, 637 SWR_BACKEND_STATE *pBEState) 638 { 639 API_STATE* pState = GetDrawState(GetContext(hContext)); 640 641 pState->backendState = *pBEState; 642 } 643 644 void SwrSetDepthBoundsState( 645 HANDLE hContext, 646 SWR_DEPTH_BOUNDS_STATE *pDBState) 647 { 648 API_STATE* pState = GetDrawState(GetContext(hContext)); 649 650 pState->depthBoundsState = *pDBState; 651 } 652 653 void SwrSetPixelShaderState( 654 HANDLE hContext, 655 SWR_PS_STATE *pPSState) 656 { 657 API_STATE *pState = GetDrawState(GetContext(hContext)); 658 pState->psState = *pPSState; 659 } 660 661 void SwrSetBlendState( 662 HANDLE hContext, 663 SWR_BLEND_STATE *pBlendState) 664 { 665 API_STATE *pState = GetDrawState(GetContext(hContext)); 666 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); 667 } 668 669 void SwrSetBlendFunc( 670 HANDLE hContext, 671 uint32_t renderTarget, 672 PFN_BLEND_JIT_FUNC pfnBlendFunc) 673 { 674 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); 675 API_STATE *pState = GetDrawState(GetContext(hContext)); 676 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; 677 } 678 679 // update guardband multipliers for the viewport 680 void updateGuardbands(API_STATE *pState) 681 { 682 uint32_t numGbs = pState->gsState.emitsRenderTargetArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; 683 684 for(uint32_t i = 0; i < numGbs; ++i) 685 { 686 // guardband center is viewport center 687 pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; 688 pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; 689 pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; 690 pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; 691 } 692 } 693 694 void SwrSetRastState( 695 HANDLE hContext, 696 const SWR_RASTSTATE *pRastState) 697 { 698 SWR_CONTEXT *pContext = GetContext(hContext); 699 API_STATE* pState = GetDrawState(pContext); 700 701 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); 702 } 703 704 void SwrSetViewports( 705 HANDLE hContext, 706 uint32_t numViewports, 707 const SWR_VIEWPORT* pViewports, 708 const SWR_VIEWPORT_MATRICES* pMatrices) 709 { 710 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, 711 "Invalid number of viewports."); 712 713 SWR_CONTEXT *pContext = GetContext(hContext); 714 API_STATE* pState = GetDrawState(pContext); 715 716 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); 717 // @todo Faster to copy portions of the SOA or just copy all of it? 718 memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES)); 719 720 updateGuardbands(pState); 721 } 722 723 void SwrSetScissorRects( 724 HANDLE hContext, 725 uint32_t numScissors, 726 const SWR_RECT* pScissors) 727 { 728 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, 729 "Invalid number of scissor rects."); 730 731 API_STATE* pState = GetDrawState(GetContext(hContext)); 732 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0])); 733 }; 734 735 void SetupMacroTileScissors(DRAW_CONTEXT *pDC) 736 { 737 API_STATE *pState = &pDC->pState->state; 738 uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; 739 pState->scissorsTileAligned = true; 740 741 for (uint32_t index = 0; index < numScissors; ++index) 742 { 743 SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index]; 744 745 // Set up scissor dimensions based on scissor or viewport 746 if (pState->rastState.scissorEnable) 747 { 748 scissorInFixedPoint = pState->scissorRects[index]; 749 } 750 else 751 { 752 // the vp width and height must be added to origin un-rounded then the result round to -inf. 753 // The cast to int works for rounding assuming all [left, right, top, bottom] are positive. 754 scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x; 755 scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width); 756 scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y; 757 scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height); 758 } 759 760 // Clamp to max rect 761 scissorInFixedPoint &= g_MaxScissorRect; 762 763 // Test for tile alignment 764 bool tileAligned; 765 tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0; 766 tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0; 767 tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0; 768 tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0; 769 770 pState->scissorsTileAligned &= tileAligned; 771 772 // Scale to fixed point 773 scissorInFixedPoint.xmin *= FIXED_POINT_SCALE; 774 scissorInFixedPoint.xmax *= FIXED_POINT_SCALE; 775 scissorInFixedPoint.ymin *= FIXED_POINT_SCALE; 776 scissorInFixedPoint.ymax *= FIXED_POINT_SCALE; 777 778 // Make scissor inclusive 779 scissorInFixedPoint.xmax -= 1; 780 scissorInFixedPoint.ymax -= 1; 781 } 782 } 783 784 // templated backend function tables 785 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; 786 extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2]; 787 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2]; 788 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]; 789 void SetupPipeline(DRAW_CONTEXT *pDC) 790 { 791 SWR_CONTEXT* pContext = pDC->pContext; 792 DRAW_STATE* pState = pDC->pState; 793 const SWR_RASTSTATE &rastState = pState->state.rastState; 794 const SWR_PS_STATE &psState = pState->state.psState; 795 BACKEND_FUNCS& backendFuncs = pState->backendFuncs; 796 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; 797 798 // setup backend 799 if (psState.pfnPixelShader == nullptr) 800 { 801 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; 802 } 803 else 804 { 805 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0; 806 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; 807 const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; 808 809 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; 810 811 // select backend function 812 switch(psState.shadingRate) 813 { 814 case SWR_SHADING_RATE_PIXEL: 815 if(bMultisampleEnable) 816 { 817 // always need to generate I & J per sample for Z interpolation 818 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); 819 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ]; 820 } 821 else 822 { 823 // always need to generate I & J per pixel for Z interpolation 824 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); 825 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; 826 } 827 break; 828 case SWR_SHADING_RATE_SAMPLE: 829 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); 830 // always need to generate I & J per sample for Z interpolation 831 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); 832 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ]; 833 break; 834 default: 835 SWR_ASSERT(0 && "Invalid shading rate"); 836 break; 837 } 838 } 839 840 PFN_PROCESS_PRIMS pfnBinner; 841 switch (pState->state.topology) 842 { 843 case TOP_POINT_LIST: 844 pState->pfnProcessPrims = ClipPoints; 845 pfnBinner = BinPoints; 846 break; 847 case TOP_LINE_LIST: 848 case TOP_LINE_STRIP: 849 case TOP_LINE_LOOP: 850 case TOP_LINE_LIST_ADJ: 851 case TOP_LISTSTRIP_ADJ: 852 pState->pfnProcessPrims = ClipLines; 853 pfnBinner = BinLines; 854 break; 855 default: 856 pState->pfnProcessPrims = ClipTriangles; 857 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0)); 858 break; 859 }; 860 861 862 // disable clipper if viewport transform is disabled 863 if (pState->state.frontendState.vpTransformDisable) 864 { 865 pState->pfnProcessPrims = pfnBinner; 866 } 867 868 if ((pState->state.psState.pfnPixelShader == nullptr) && 869 (pState->state.depthStencilState.depthTestEnable == FALSE) && 870 (pState->state.depthStencilState.depthWriteEnable == FALSE) && 871 (pState->state.depthStencilState.stencilTestEnable == FALSE) && 872 (pState->state.depthStencilState.stencilWriteEnable == FALSE) && 873 (pState->state.backendState.numAttributes == 0)) 874 { 875 pState->pfnProcessPrims = nullptr; 876 } 877 878 if (pState->state.soState.rasterizerDisable == true) 879 { 880 pState->pfnProcessPrims = nullptr; 881 } 882 883 884 // set up the frontend attribute count 885 pState->state.feNumAttributes = 0; 886 const SWR_BACKEND_STATE& backendState = pState->state.backendState; 887 if (backendState.swizzleEnable) 888 { 889 // attribute swizzling is enabled, iterate over the map and record the max attribute used 890 for (uint32_t i = 0; i < backendState.numAttributes; ++i) 891 { 892 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1); 893 } 894 } 895 else 896 { 897 pState->state.feNumAttributes = pState->state.backendState.numAttributes; 898 } 899 900 if (pState->state.soState.soEnable) 901 { 902 uint32_t streamMasks = 0; 903 for (uint32_t i = 0; i < 4; ++i) 904 { 905 streamMasks |= pState->state.soState.streamMasks[i]; 906 } 907 908 DWORD maxAttrib; 909 if (_BitScanReverse(&maxAttrib, streamMasks)) 910 { 911 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1)); 912 } 913 } 914 915 // complicated logic to test for cases where we don't need backing hottile memory for a draw 916 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled. 917 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable && 918 !pState->state.depthStencilState.depthWriteEnable && 919 !pState->state.depthBoundsState.depthBoundsTestEnable && 920 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && 921 (pState->state.depthStencilState.depthTestEnable || 922 pState->state.depthStencilState.depthWriteEnable || 923 pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false; 924 925 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable && 926 !pState->state.depthStencilState.stencilWriteEnable && 927 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) || 928 // for stencil we have to check the double sided state as well 929 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable && 930 !pState->state.depthStencilState.stencilWriteEnable && 931 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && 932 (pState->state.depthStencilState.stencilTestEnable || 933 pState->state.depthStencilState.stencilWriteEnable)) ? true : false; 934 935 uint32_t numRTs = pState->state.psState.numRenderTargets; 936 pState->state.colorHottileEnable = 0; 937 if (psState.pfnPixelShader != nullptr) 938 { 939 for (uint32_t rt = 0; rt < numRTs; ++rt) 940 { 941 pState->state.colorHottileEnable |= 942 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha || 943 !pState->state.blendState.renderTarget[rt].writeDisableRed || 944 !pState->state.blendState.renderTarget[rt].writeDisableGreen || 945 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; 946 } 947 } 948 949 // Setup depth quantization function 950 if (pState->state.depthHottileEnable) 951 { 952 switch (pState->state.rastState.depthFormat) 953 { 954 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break; 955 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break; 956 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break; 957 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break; 958 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion."); 959 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; 960 } 961 } 962 else 963 { 964 // set up pass-through quantize if depth isn't enabled 965 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; 966 } 967 } 968 969 ////////////////////////////////////////////////////////////////////////// 970 /// @brief InitDraw 971 /// @param pDC - Draw context to initialize for this draw. 972 void InitDraw( 973 DRAW_CONTEXT *pDC, 974 bool isSplitDraw) 975 { 976 // We don't need to re-setup the scissors/pipeline state again for split draw. 977 if (isSplitDraw == false) 978 { 979 SetupMacroTileScissors(pDC); 980 SetupPipeline(pDC); 981 } 982 983 984 } 985 986 ////////////////////////////////////////////////////////////////////////// 987 /// @brief We can split the draw for certain topologies for better performance. 988 /// @param totalVerts - Total vertices for draw 989 /// @param topology - Topology used for draw 990 uint32_t MaxVertsPerDraw( 991 DRAW_CONTEXT* pDC, 992 uint32_t totalVerts, 993 PRIMITIVE_TOPOLOGY topology) 994 { 995 API_STATE& state = pDC->pState->state; 996 997 uint32_t vertsPerDraw = totalVerts; 998 999 if (state.soState.soEnable) 1000 { 1001 return totalVerts; 1002 } 1003 1004 switch (topology) 1005 { 1006 case TOP_POINT_LIST: 1007 case TOP_TRIANGLE_LIST: 1008 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW; 1009 break; 1010 1011 case TOP_PATCHLIST_1: 1012 case TOP_PATCHLIST_2: 1013 case TOP_PATCHLIST_3: 1014 case TOP_PATCHLIST_4: 1015 case TOP_PATCHLIST_5: 1016 case TOP_PATCHLIST_6: 1017 case TOP_PATCHLIST_7: 1018 case TOP_PATCHLIST_8: 1019 case TOP_PATCHLIST_9: 1020 case TOP_PATCHLIST_10: 1021 case TOP_PATCHLIST_11: 1022 case TOP_PATCHLIST_12: 1023 case TOP_PATCHLIST_13: 1024 case TOP_PATCHLIST_14: 1025 case TOP_PATCHLIST_15: 1026 case TOP_PATCHLIST_16: 1027 case TOP_PATCHLIST_17: 1028 case TOP_PATCHLIST_18: 1029 case TOP_PATCHLIST_19: 1030 case TOP_PATCHLIST_20: 1031 case TOP_PATCHLIST_21: 1032 case TOP_PATCHLIST_22: 1033 case TOP_PATCHLIST_23: 1034 case TOP_PATCHLIST_24: 1035 case TOP_PATCHLIST_25: 1036 case TOP_PATCHLIST_26: 1037 case TOP_PATCHLIST_27: 1038 case TOP_PATCHLIST_28: 1039 case TOP_PATCHLIST_29: 1040 case TOP_PATCHLIST_30: 1041 case TOP_PATCHLIST_31: 1042 case TOP_PATCHLIST_32: 1043 if (pDC->pState->state.tsState.tsEnable) 1044 { 1045 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; 1046 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; 1047 } 1048 break; 1049 1050 // The Primitive Assembly code can only handle 1 RECT at a time. 1051 case TOP_RECT_LIST: 1052 vertsPerDraw = 3; 1053 break; 1054 1055 default: 1056 // We are not splitting up draws for other topologies. 1057 break; 1058 } 1059 1060 return vertsPerDraw; 1061 } 1062 1063 1064 ////////////////////////////////////////////////////////////////////////// 1065 /// @brief DrawInstanced 1066 /// @param hContext - Handle passed back from SwrCreateContext 1067 /// @param topology - Specifies topology for draw. 1068 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance). 1069 /// @param startVertex - Specifies start vertex for draw. (vertex data) 1070 /// @param numInstances - How many instances to render. 1071 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1072 void DrawInstanced( 1073 HANDLE hContext, 1074 PRIMITIVE_TOPOLOGY topology, 1075 uint32_t numVertices, 1076 uint32_t startVertex, 1077 uint32_t numInstances = 1, 1078 uint32_t startInstance = 0) 1079 { 1080 if (KNOB_TOSS_DRAW) 1081 { 1082 return; 1083 } 1084 1085 SWR_CONTEXT *pContext = GetContext(hContext); 1086 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1087 1088 AR_API_BEGIN(APIDraw, pDC->drawId); 1089 AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertices, startVertex, numInstances, startInstance)); 1090 1091 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); 1092 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); 1093 uint32_t remainingVerts = numVertices; 1094 1095 API_STATE *pState = &pDC->pState->state; 1096 pState->topology = topology; 1097 pState->forceFront = false; 1098 1099 // disable culling for points/lines 1100 uint32_t oldCullMode = pState->rastState.cullMode; 1101 if (topology == TOP_POINT_LIST) 1102 { 1103 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1104 pState->forceFront = true; 1105 } 1106 else if (topology == TOP_RECT_LIST) 1107 { 1108 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1109 } 1110 1111 1112 int draw = 0; 1113 while (remainingVerts) 1114 { 1115 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? 1116 remainingVerts : maxVertsPerDraw; 1117 1118 bool isSplitDraw = (draw > 0) ? true : false; 1119 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); 1120 InitDraw(pDC, isSplitDraw); 1121 1122 pDC->FeWork.type = DRAW; 1123 pDC->FeWork.pfnWork = GetProcessDrawFunc( 1124 false, // IsIndexed 1125 false, // bEnableCutIndex 1126 pState->tsState.tsEnable, 1127 pState->gsState.gsEnable, 1128 pState->soState.soEnable, 1129 pDC->pState->pfnProcessPrims != nullptr); 1130 pDC->FeWork.desc.draw.numVerts = numVertsForDraw; 1131 pDC->FeWork.desc.draw.startVertex = startVertex; 1132 pDC->FeWork.desc.draw.numInstances = numInstances; 1133 pDC->FeWork.desc.draw.startInstance = startInstance; 1134 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; 1135 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; 1136 1137 pDC->cleanupState = (remainingVerts == numVertsForDraw); 1138 1139 //enqueue DC 1140 QueueDraw(pContext); 1141 1142 remainingVerts -= numVertsForDraw; 1143 draw++; 1144 } 1145 1146 // restore culling state 1147 pDC = GetDrawContext(pContext); 1148 pDC->pState->state.rastState.cullMode = oldCullMode; 1149 1150 1151 AR_API_END(APIDraw, numVertices * numInstances); 1152 } 1153 1154 ////////////////////////////////////////////////////////////////////////// 1155 /// @brief SwrDraw 1156 /// @param hContext - Handle passed back from SwrCreateContext 1157 /// @param topology - Specifies topology for draw. 1158 /// @param startVertex - Specifies start vertex in vertex buffer for draw. 1159 /// @param primCount - Number of vertices. 1160 void SwrDraw( 1161 HANDLE hContext, 1162 PRIMITIVE_TOPOLOGY topology, 1163 uint32_t startVertex, 1164 uint32_t numVertices) 1165 { 1166 DrawInstanced(hContext, topology, numVertices, startVertex); 1167 } 1168 1169 ////////////////////////////////////////////////////////////////////////// 1170 /// @brief SwrDrawInstanced 1171 /// @param hContext - Handle passed back from SwrCreateContext 1172 /// @param topology - Specifies topology for draw. 1173 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. 1174 /// @param numInstances - How many instances to render. 1175 /// @param startVertex - Specifies start vertex for draw. (vertex data) 1176 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1177 void SwrDrawInstanced( 1178 HANDLE hContext, 1179 PRIMITIVE_TOPOLOGY topology, 1180 uint32_t numVertsPerInstance, 1181 uint32_t numInstances, 1182 uint32_t startVertex, 1183 uint32_t startInstance 1184 ) 1185 { 1186 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); 1187 } 1188 1189 ////////////////////////////////////////////////////////////////////////// 1190 /// @brief DrawIndexedInstanced 1191 /// @param hContext - Handle passed back from SwrCreateContext 1192 /// @param topology - Specifies topology for draw. 1193 /// @param numIndices - Number of indices to read sequentially from index buffer. 1194 /// @param indexOffset - Starting index into index buffer. 1195 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 1196 /// @param numInstances - Number of instances to render. 1197 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1198 void DrawIndexedInstance( 1199 HANDLE hContext, 1200 PRIMITIVE_TOPOLOGY topology, 1201 uint32_t numIndices, 1202 uint32_t indexOffset, 1203 int32_t baseVertex, 1204 uint32_t numInstances = 1, 1205 uint32_t startInstance = 0) 1206 { 1207 if (KNOB_TOSS_DRAW) 1208 { 1209 return; 1210 } 1211 1212 SWR_CONTEXT *pContext = GetContext(hContext); 1213 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1214 API_STATE* pState = &pDC->pState->state; 1215 1216 AR_API_BEGIN(APIDrawIndexed, pDC->drawId); 1217 AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance)); 1218 1219 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); 1220 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); 1221 uint32_t remainingIndices = numIndices; 1222 1223 uint32_t indexSize = 0; 1224 switch (pState->indexBuffer.format) 1225 { 1226 case R32_UINT: indexSize = sizeof(uint32_t); break; 1227 case R16_UINT: indexSize = sizeof(uint16_t); break; 1228 case R8_UINT: indexSize = sizeof(uint8_t); break; 1229 default: 1230 SWR_ASSERT(0); 1231 } 1232 1233 int draw = 0; 1234 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; 1235 pIB += (uint64_t)indexOffset * (uint64_t)indexSize; 1236 1237 pState->topology = topology; 1238 pState->forceFront = false; 1239 1240 // disable culling for points/lines 1241 uint32_t oldCullMode = pState->rastState.cullMode; 1242 if (topology == TOP_POINT_LIST) 1243 { 1244 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1245 pState->forceFront = true; 1246 } 1247 else if (topology == TOP_RECT_LIST) 1248 { 1249 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1250 } 1251 1252 1253 while (remainingIndices) 1254 { 1255 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? 1256 remainingIndices : maxIndicesPerDraw; 1257 1258 // When breaking up draw, we need to obtain new draw context for each iteration. 1259 bool isSplitDraw = (draw > 0) ? true : false; 1260 1261 pDC = GetDrawContext(pContext, isSplitDraw); 1262 InitDraw(pDC, isSplitDraw); 1263 1264 pDC->FeWork.type = DRAW; 1265 pDC->FeWork.pfnWork = GetProcessDrawFunc( 1266 true, // IsIndexed 1267 pState->frontendState.bEnableCutIndex, 1268 pState->tsState.tsEnable, 1269 pState->gsState.gsEnable, 1270 pState->soState.soEnable, 1271 pDC->pState->pfnProcessPrims != nullptr); 1272 pDC->FeWork.desc.draw.pDC = pDC; 1273 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; 1274 pDC->FeWork.desc.draw.pIB = (int*)pIB; 1275 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; 1276 1277 pDC->FeWork.desc.draw.numInstances = numInstances; 1278 pDC->FeWork.desc.draw.startInstance = startInstance; 1279 pDC->FeWork.desc.draw.baseVertex = baseVertex; 1280 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; 1281 1282 pDC->cleanupState = (remainingIndices == numIndicesForDraw); 1283 1284 //enqueue DC 1285 QueueDraw(pContext); 1286 1287 pIB += maxIndicesPerDraw * indexSize; 1288 remainingIndices -= numIndicesForDraw; 1289 draw++; 1290 } 1291 1292 // Restore culling state 1293 pDC = GetDrawContext(pContext); 1294 pDC->pState->state.rastState.cullMode = oldCullMode; 1295 1296 1297 AR_API_END(APIDrawIndexed, numIndices * numInstances); 1298 } 1299 1300 1301 ////////////////////////////////////////////////////////////////////////// 1302 /// @brief DrawIndexed 1303 /// @param hContext - Handle passed back from SwrCreateContext 1304 /// @param topology - Specifies topology for draw. 1305 /// @param numIndices - Number of indices to read sequentially from index buffer. 1306 /// @param indexOffset - Starting index into index buffer. 1307 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 1308 void SwrDrawIndexed( 1309 HANDLE hContext, 1310 PRIMITIVE_TOPOLOGY topology, 1311 uint32_t numIndices, 1312 uint32_t indexOffset, 1313 int32_t baseVertex 1314 ) 1315 { 1316 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); 1317 } 1318 1319 ////////////////////////////////////////////////////////////////////////// 1320 /// @brief SwrDrawIndexedInstanced 1321 /// @param hContext - Handle passed back from SwrCreateContext 1322 /// @param topology - Specifies topology for draw. 1323 /// @param numIndices - Number of indices to read sequentially from index buffer. 1324 /// @param numInstances - Number of instances to render. 1325 /// @param indexOffset - Starting index into index buffer. 1326 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 1327 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1328 void SwrDrawIndexedInstanced( 1329 HANDLE hContext, 1330 PRIMITIVE_TOPOLOGY topology, 1331 uint32_t numIndices, 1332 uint32_t numInstances, 1333 uint32_t indexOffset, 1334 int32_t baseVertex, 1335 uint32_t startInstance) 1336 { 1337 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); 1338 } 1339 1340 ////////////////////////////////////////////////////////////////////////// 1341 /// @brief SwrInvalidateTiles 1342 /// @param hContext - Handle passed back from SwrCreateContext 1343 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. 1344 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to 1345 /// be hottile size-aligned. 1346 void SWR_API SwrInvalidateTiles( 1347 HANDLE hContext, 1348 uint32_t attachmentMask, 1349 const SWR_RECT& invalidateRect) 1350 { 1351 if (KNOB_TOSS_DRAW) 1352 { 1353 return; 1354 } 1355 1356 SWR_CONTEXT *pContext = GetContext(hContext); 1357 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1358 1359 pDC->FeWork.type = DISCARDINVALIDATETILES; 1360 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; 1361 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; 1362 pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect; 1363 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; 1364 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; 1365 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; 1366 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; 1367 1368 //enqueue 1369 QueueDraw(pContext); 1370 } 1371 1372 ////////////////////////////////////////////////////////////////////////// 1373 /// @brief SwrDiscardRect 1374 /// @param hContext - Handle passed back from SwrCreateContext 1375 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. 1376 /// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be 1377 /// discarded. 1378 void SWR_API SwrDiscardRect( 1379 HANDLE hContext, 1380 uint32_t attachmentMask, 1381 const SWR_RECT& rect) 1382 { 1383 if (KNOB_TOSS_DRAW) 1384 { 1385 return; 1386 } 1387 1388 SWR_CONTEXT *pContext = GetContext(hContext); 1389 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1390 1391 // Queue a load to the hottile 1392 pDC->FeWork.type = DISCARDINVALIDATETILES; 1393 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; 1394 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; 1395 pDC->FeWork.desc.discardInvalidateTiles.rect = rect; 1396 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; 1397 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; 1398 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; 1399 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; 1400 1401 //enqueue 1402 QueueDraw(pContext); 1403 } 1404 1405 ////////////////////////////////////////////////////////////////////////// 1406 /// @brief SwrDispatch 1407 /// @param hContext - Handle passed back from SwrCreateContext 1408 /// @param threadGroupCountX - Number of thread groups dispatched in X direction 1409 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction 1410 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction 1411 void SwrDispatch( 1412 HANDLE hContext, 1413 uint32_t threadGroupCountX, 1414 uint32_t threadGroupCountY, 1415 uint32_t threadGroupCountZ) 1416 { 1417 if (KNOB_TOSS_DRAW) 1418 { 1419 return; 1420 } 1421 1422 SWR_CONTEXT *pContext = GetContext(hContext); 1423 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1424 1425 AR_API_BEGIN(APIDispatch, pDC->drawId); 1426 AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ)); 1427 pDC->isCompute = true; // This is a compute context. 1428 1429 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); 1430 1431 pTaskData->threadGroupCountX = threadGroupCountX; 1432 pTaskData->threadGroupCountY = threadGroupCountY; 1433 pTaskData->threadGroupCountZ = threadGroupCountZ; 1434 1435 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; 1436 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; 1437 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; 1438 pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE); 1439 1440 QueueDispatch(pContext); 1441 AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ); 1442 } 1443 1444 // Deswizzles, converts and stores current contents of the hot tiles to surface 1445 // described by pState 1446 void SWR_API SwrStoreTiles( 1447 HANDLE hContext, 1448 uint32_t attachmentMask, 1449 SWR_TILE_STATE postStoreTileState, 1450 const SWR_RECT& storeRect) 1451 { 1452 if (KNOB_TOSS_DRAW) 1453 { 1454 return; 1455 } 1456 1457 SWR_CONTEXT *pContext = GetContext(hContext); 1458 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1459 1460 AR_API_BEGIN(APIStoreTiles, pDC->drawId); 1461 1462 pDC->FeWork.type = STORETILES; 1463 pDC->FeWork.pfnWork = ProcessStoreTiles; 1464 pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask; 1465 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; 1466 pDC->FeWork.desc.storeTiles.rect = storeRect; 1467 pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect; 1468 1469 //enqueue 1470 QueueDraw(pContext); 1471 1472 AR_API_END(APIStoreTiles, 1); 1473 } 1474 1475 ////////////////////////////////////////////////////////////////////////// 1476 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil 1477 /// @param hContext - Handle passed back from SwrCreateContext 1478 /// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear 1479 /// @param renderTargetArrayIndex - the RT array index to clear 1480 /// @param clearColor - color use for clearing render targets 1481 /// @param z - depth value use for clearing depth buffer 1482 /// @param stencil - stencil value used for clearing stencil buffer 1483 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers 1484 void SWR_API SwrClearRenderTarget( 1485 HANDLE hContext, 1486 uint32_t attachmentMask, 1487 uint32_t renderTargetArrayIndex, 1488 const float clearColor[4], 1489 float z, 1490 uint8_t stencil, 1491 const SWR_RECT& clearRect) 1492 { 1493 if (KNOB_TOSS_DRAW) 1494 { 1495 return; 1496 } 1497 1498 SWR_CONTEXT *pContext = GetContext(hContext); 1499 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1500 1501 AR_API_BEGIN(APIClearRenderTarget, pDC->drawId); 1502 1503 pDC->FeWork.type = CLEAR; 1504 pDC->FeWork.pfnWork = ProcessClear; 1505 pDC->FeWork.desc.clear.rect = clearRect; 1506 pDC->FeWork.desc.clear.rect &= g_MaxScissorRect; 1507 pDC->FeWork.desc.clear.attachmentMask = attachmentMask; 1508 pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex; 1509 pDC->FeWork.desc.clear.clearDepth = z; 1510 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; 1511 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; 1512 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; 1513 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; 1514 pDC->FeWork.desc.clear.clearStencil = stencil; 1515 1516 // enqueue draw 1517 QueueDraw(pContext); 1518 1519 AR_API_END(APIClearRenderTarget, 1); 1520 } 1521 1522 ////////////////////////////////////////////////////////////////////////// 1523 /// @brief Returns a pointer to the private context state for the current 1524 /// draw operation. This is used for external componets such as the 1525 /// sampler. 1526 /// SWR is responsible for the allocation of the private context state. 1527 /// @param hContext - Handle passed back from SwrCreateContext 1528 VOID* SwrGetPrivateContextState( 1529 HANDLE hContext) 1530 { 1531 SWR_CONTEXT* pContext = GetContext(hContext); 1532 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1533 DRAW_STATE* pState = pDC->pState; 1534 1535 if (pState->pPrivateState == nullptr) 1536 { 1537 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); 1538 } 1539 1540 return pState->pPrivateState; 1541 } 1542 1543 ////////////////////////////////////////////////////////////////////////// 1544 /// @brief Clients can use this to allocate memory for draw/dispatch 1545 /// operations. The memory will automatically be freed once operation 1546 /// has completed. Client can use this to allocate binding tables, 1547 /// etc. needed for shader execution. 1548 /// @param hContext - Handle passed back from SwrCreateContext 1549 /// @param size - Size of allocation 1550 /// @param align - Alignment needed for allocation. 1551 VOID* SwrAllocDrawContextMemory( 1552 HANDLE hContext, 1553 uint32_t size, 1554 uint32_t align) 1555 { 1556 SWR_CONTEXT* pContext = GetContext(hContext); 1557 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1558 1559 return pDC->pState->pArena->AllocAligned(size, align); 1560 } 1561 1562 ////////////////////////////////////////////////////////////////////////// 1563 /// @brief Enables stats counting 1564 /// @param hContext - Handle passed back from SwrCreateContext 1565 /// @param enable - If true then counts are incremented. 1566 void SwrEnableStatsFE( 1567 HANDLE hContext, 1568 bool enable) 1569 { 1570 SWR_CONTEXT *pContext = GetContext(hContext); 1571 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1572 1573 pDC->pState->state.enableStatsFE = enable; 1574 } 1575 1576 ////////////////////////////////////////////////////////////////////////// 1577 /// @brief Enables stats counting 1578 /// @param hContext - Handle passed back from SwrCreateContext 1579 /// @param enable - If true then counts are incremented. 1580 void SwrEnableStatsBE( 1581 HANDLE hContext, 1582 bool enable) 1583 { 1584 SWR_CONTEXT *pContext = GetContext(hContext); 1585 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1586 1587 pDC->pState->state.enableStatsBE = enable; 1588 } 1589 1590 ////////////////////////////////////////////////////////////////////////// 1591 /// @brief Mark end of frame - used for performance profiling 1592 /// @param hContext - Handle passed back from SwrCreateContext 1593 void SWR_API SwrEndFrame( 1594 HANDLE hContext) 1595 { 1596 SWR_CONTEXT *pContext = GetContext(hContext); 1597 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1598 1599 RDTSC_ENDFRAME(); 1600 AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId)); 1601 1602 pContext->frameCount++; 1603 } 1604 1605