Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file api.cpp
     24 *
     25 * @brief API implementation
     26 *
     27 ******************************************************************************/
     28 
     29 #include <cfloat>
     30 #include <cmath>
     31 #include <cstdio>
     32 #include <new>
     33 
     34 #include "core/api.h"
     35 #include "core/backend.h"
     36 #include "core/context.h"
     37 #include "core/depthstencil.h"
     38 #include "core/frontend.h"
     39 #include "core/rasterizer.h"
     40 #include "core/rdtsc_core.h"
     41 #include "core/threads.h"
     42 #include "core/tilemgr.h"
     43 #include "core/clip.h"
     44 #include "core/utils.h"
     45 
     46 #include "common/os.h"
     47 
     48 static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y };
     49 
     50 void SetupDefaultState(SWR_CONTEXT *pContext);
     51 
     52 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
     53 {
     54     return (SWR_CONTEXT*)hContext;
     55 }
     56 
     57 void WakeAllThreads(SWR_CONTEXT *pContext)
     58 {
     59     pContext->FifosNotEmpty.notify_all();
     60 }
     61 
     62 //////////////////////////////////////////////////////////////////////////
     63 /// @brief Create SWR Context.
     64 /// @param pCreateInfo - pointer to creation info.
     65 HANDLE SwrCreateContext(
     66     SWR_CREATECONTEXT_INFO* pCreateInfo)
     67 {
     68     RDTSC_RESET();
     69     RDTSC_INIT(0);
     70 
     71     void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
     72     memset(pContextMem, 0, sizeof(SWR_CONTEXT));
     73     SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
     74 
     75     pContext->privateStateSize = pCreateInfo->privateStateSize;
     76 
     77     pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
     78     if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
     79     {
     80         pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
     81     }
     82 
     83     pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
     84     pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
     85 
     86     pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
     87     pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
     88 
     89     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
     90     {
     91         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     92         new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
     93         new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
     94 
     95         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     96     }
     97 
     98     if (pCreateInfo->pThreadInfo)
     99     {
    100         pContext->threadInfo = *pCreateInfo->pThreadInfo;
    101     }
    102     else
    103     {
    104         pContext->threadInfo.MAX_WORKER_THREADS         = KNOB_MAX_WORKER_THREADS;
    105         pContext->threadInfo.BASE_NUMA_NODE             = KNOB_BASE_NUMA_NODE;
    106         pContext->threadInfo.BASE_CORE                  = KNOB_BASE_CORE;
    107         pContext->threadInfo.BASE_THREAD                = KNOB_BASE_THREAD;
    108         pContext->threadInfo.MAX_NUMA_NODES             = KNOB_MAX_NUMA_NODES;
    109         pContext->threadInfo.MAX_CORES_PER_NUMA_NODE    = KNOB_MAX_CORES_PER_NUMA_NODE;
    110         pContext->threadInfo.MAX_THREADS_PER_CORE       = KNOB_MAX_THREADS_PER_CORE;
    111         pContext->threadInfo.SINGLE_THREADED            = KNOB_SINGLE_THREADED;
    112     }
    113 
    114     if (pCreateInfo->pApiThreadInfo)
    115     {
    116         pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
    117     }
    118     else
    119     {
    120         pContext->apiThreadInfo.bindAPIThread0          = true;
    121         pContext->apiThreadInfo.numAPIReservedThreads   = 1;
    122         pContext->apiThreadInfo.numAPIThreadsPerCore    = 1;
    123     }
    124 
    125     memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
    126     memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
    127     new (&pContext->WaitLock) std::mutex();
    128     new (&pContext->FifosNotEmpty) std::condition_variable();
    129 
    130     CreateThreadPool(pContext, &pContext->threadPool);
    131 
    132     if (pContext->apiThreadInfo.bindAPIThread0)
    133     {
    134         BindApiThread(pContext, 0);
    135     }
    136 
    137     pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
    138     pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
    139 
    140 #if defined(KNOB_ENABLE_AR)
    141     // Setup ArchRast thread contexts which includes +1 for API thread.
    142     pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1];
    143     pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
    144 #endif
    145 
    146     // Allocate scratch space for workers.
    147     ///@note We could lazily allocate this but its rather small amount of memory.
    148     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    149     {
    150 #if defined(_WIN32)
    151         uint32_t numaNode = pContext->threadPool.pThreadData ?
    152             pContext->threadPool.pThreadData[i].numaId : 0;
    153         pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(
    154             GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
    155             MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
    156             numaNode);
    157 #else
    158         pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
    159 #endif
    160 
    161 #if defined(KNOB_ENABLE_AR)
    162         // Initialize worker thread context for ArchRast.
    163         pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
    164 #endif
    165     }
    166 
    167 #if defined(KNOB_ENABLE_AR)
    168     // cache the API thread event manager, for use with sim layer
    169     pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads + 1];
    170 #endif
    171 
    172     // State setup AFTER context is fully initialized
    173     SetupDefaultState(pContext);
    174 
    175     // initialize hot tile manager
    176     pContext->pHotTileMgr = new HotTileMgr();
    177 
    178     // initialize callback functions
    179     pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
    180     pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
    181     pContext->pfnClearTile = pCreateInfo->pfnClearTile;
    182     pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
    183     pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
    184     pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
    185 
    186 
    187     // pass pointer to bucket manager back to caller
    188 #ifdef KNOB_ENABLE_RDTSC
    189     pCreateInfo->pBucketMgr = &gBucketMgr;
    190 #endif
    191 
    192     pCreateInfo->contextSaveSize = sizeof(API_STATE);
    193 
    194     StartThreadPool(pContext, &pContext->threadPool);
    195 
    196     return (HANDLE)pContext;
    197 }
    198 
    199 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
    200 {
    201     memcpy(&dst.state, &src.state, sizeof(API_STATE));
    202 }
    203 
    204 template<bool IsDraw>
    205 void QueueWork(SWR_CONTEXT *pContext)
    206 {
    207     DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
    208     uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
    209 
    210     if (IsDraw)
    211     {
    212         pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
    213         pDC->pTileMgr->initialize();
    214     }
    215 
    216     // Each worker thread looks at a DC for both FE and BE work at different times and so we
    217     // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
    218     // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
    219     // then moved on if all work is done.)
    220     pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
    221 
    222     if (IsDraw)
    223     {
    224         InterlockedIncrement(&pContext->drawsOutstandingFE);
    225     }
    226 
    227     _ReadWriteBarrier();
    228     {
    229         std::unique_lock<std::mutex> lock(pContext->WaitLock);
    230         pContext->dcRing.Enqueue();
    231     }
    232 
    233     if (pContext->threadInfo.SINGLE_THREADED)
    234     {
    235         // flush denormals to 0
    236         uint32_t mxcsr = _mm_getcsr();
    237         _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
    238 
    239         if (IsDraw)
    240         {
    241             uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
    242             WorkOnFifoFE(pContext, 0, curDraw[0]);
    243             WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
    244         }
    245         else
    246         {
    247             uint32_t curDispatch = pContext->pCurDrawContext->drawId;
    248             WorkOnCompute(pContext, 0, curDispatch);
    249         }
    250 
    251         // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
    252         while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
    253 
    254         // restore csr
    255         _mm_setcsr(mxcsr);
    256     }
    257     else
    258     {
    259         AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
    260         WakeAllThreads(pContext);
    261         AR_API_END(APIDrawWakeAllThreads, 1);
    262     }
    263 
    264     // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
    265     pContext->pPrevDrawContext = pContext->pCurDrawContext;
    266     pContext->pCurDrawContext = nullptr;
    267 }
    268 
    269 INLINE void QueueDraw(SWR_CONTEXT* pContext)
    270 {
    271     QueueWork<true>(pContext);
    272 }
    273 
    274 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
    275 {
    276     QueueWork<false>(pContext);
    277 }
    278 
    279 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
    280 {
    281     AR_API_BEGIN(APIGetDrawContext, 0);
    282     // If current draw context is null then need to obtain a new draw context to use from ring.
    283     if (pContext->pCurDrawContext == nullptr)
    284     {
    285         // Need to wait for a free entry.
    286         while (pContext->dcRing.IsFull())
    287         {
    288             _mm_pause();
    289         }
    290 
    291         uint64_t curDraw = pContext->dcRing.GetHead();
    292         uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
    293 
    294         if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
    295             (curDraw - pContext->lastDrawChecked) > 0x10000)
    296         {
    297             // Take this opportunity to clean-up old arena allocations
    298             pContext->cachingArenaAllocator.FreeOldBlocks();
    299 
    300             pContext->lastFrameChecked = pContext->frameCount;
    301             pContext->lastDrawChecked = curDraw;
    302         }
    303 
    304         DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
    305         pContext->pCurDrawContext = pCurDrawContext;
    306 
    307         // Assign next available entry in DS ring to this DC.
    308         uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
    309         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
    310 
    311         // Copy previous state to current state.
    312         if (pContext->pPrevDrawContext)
    313         {
    314             DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
    315 
    316             // If we're splitting our draw then we can just use the same state from the previous
    317             // draw. In this case, we won't increment the DS ring index so the next non-split
    318             // draw can receive the state.
    319             if (isSplitDraw == false)
    320             {
    321                 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
    322 
    323                 // Should have been cleaned up previously
    324                 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
    325 
    326                 pCurDrawContext->pState->pPrivateState = nullptr;
    327 
    328                 pContext->curStateId++;  // Progress state ring index forward.
    329             }
    330             else
    331             {
    332                 // If its a split draw then just copy the state pointer over
    333                 // since its the same draw.
    334                 pCurDrawContext->pState = pPrevDrawContext->pState;
    335                 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
    336             }
    337         }
    338         else
    339         {
    340             SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
    341             pContext->curStateId++;  // Progress state ring index forward.
    342         }
    343 
    344         SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
    345 
    346         // Reset dependency
    347         pCurDrawContext->dependent = false;
    348         pCurDrawContext->dependentFE = false;
    349 
    350         pCurDrawContext->pContext = pContext;
    351         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
    352 
    353         pCurDrawContext->doneFE = false;
    354         pCurDrawContext->FeLock = 0;
    355         pCurDrawContext->threadsDone = 0;
    356         pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
    357 
    358         pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);
    359 
    360         // Assign unique drawId for this DC
    361         pCurDrawContext->drawId = pContext->dcRing.GetHead();
    362 
    363         pCurDrawContext->cleanupState = true;
    364     }
    365     else
    366     {
    367         SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
    368     }
    369 
    370     AR_API_END(APIGetDrawContext, 0);
    371     return pContext->pCurDrawContext;
    372 }
    373 
    374 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
    375 {
    376     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    377     SWR_ASSERT(pDC->pState != nullptr);
    378 
    379     return &pDC->pState->state;
    380 }
    381 
    382 void SwrDestroyContext(HANDLE hContext)
    383 {
    384     SWR_CONTEXT *pContext = GetContext(hContext);
    385     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    386 
    387     pDC->FeWork.type = SHUTDOWN;
    388     pDC->FeWork.pfnWork = ProcessShutdown;
    389 
    390     //enqueue
    391     QueueDraw(pContext);
    392 
    393     DestroyThreadPool(pContext, &pContext->threadPool);
    394 
    395     // free the fifos
    396     for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
    397     {
    398         AlignedFree(pContext->dcRing[i].dynState.pStats);
    399         delete pContext->dcRing[i].pArena;
    400         delete pContext->dsRing[i].pArena;
    401         pContext->pMacroTileManagerArray[i].~MacroTileMgr();
    402         pContext->pDispatchQueueArray[i].~DispatchQueue();
    403     }
    404 
    405     AlignedFree(pContext->pDispatchQueueArray);
    406     AlignedFree(pContext->pMacroTileManagerArray);
    407 
    408     // Free scratch space.
    409     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    410     {
    411 #if defined(_WIN32)
    412         VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
    413 #else
    414         AlignedFree(pContext->ppScratch[i]);
    415 #endif
    416 
    417 #if defined(KNOB_ENABLE_AR)
    418         ArchRast::DestroyThreadContext(pContext->pArContext[i]);
    419 #endif
    420     }
    421 
    422     delete[] pContext->ppScratch;
    423     AlignedFree(pContext->pStats);
    424 
    425     delete(pContext->pHotTileMgr);
    426 
    427     pContext->~SWR_CONTEXT();
    428     AlignedFree(GetContext(hContext));
    429 }
    430 
    431 void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
    432 {
    433     SWR_CONTEXT *pContext = GetContext(hContext);
    434     BindApiThread(pContext, apiThreadId);
    435 }
    436 
    437 void SWR_API SwrSaveState(
    438     HANDLE hContext,
    439     void* pOutputStateBlock,
    440     size_t memSize)
    441 {
    442     SWR_CONTEXT *pContext = GetContext(hContext);
    443     auto pSrc = GetDrawState(pContext);
    444     SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
    445 
    446     memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
    447 }
    448 
    449 void SWR_API SwrRestoreState(
    450     HANDLE hContext,
    451     const void* pStateBlock,
    452     size_t memSize)
    453 {
    454     SWR_CONTEXT *pContext = GetContext(hContext);
    455     auto pDst = GetDrawState(pContext);
    456     SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
    457 
    458     memcpy(pDst, pStateBlock, sizeof(*pDst));
    459 }
    460 
    461 void SetupDefaultState(SWR_CONTEXT *pContext)
    462 {
    463     API_STATE* pState = GetDrawState(pContext);
    464 
    465     pState->rastState.cullMode = SWR_CULLMODE_NONE;
    466     pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
    467 
    468     pState->depthBoundsState.depthBoundsTestEnable = false;
    469     pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
    470     pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
    471 }
    472 
    473 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
    474 {
    475     SWR_ASSERT(pfnFunc != nullptr);
    476 
    477     SWR_CONTEXT *pContext = GetContext(hContext);
    478     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    479 
    480     AR_API_BEGIN(APISync, 0);
    481 
    482     pDC->FeWork.type = SYNC;
    483     pDC->FeWork.pfnWork = ProcessSync;
    484 
    485     // Setup callback function
    486     pDC->retireCallback.pfnCallbackFunc = pfnFunc;
    487     pDC->retireCallback.userData = userData;
    488     pDC->retireCallback.userData2 = userData2;
    489     pDC->retireCallback.userData3 = userData3;
    490 
    491     AR_API_EVENT(SwrSyncEvent(pDC->drawId));
    492 
    493     //enqueue
    494     QueueDraw(pContext);
    495 
    496     AR_API_END(APISync, 1);
    497 }
    498 
    499 void SwrStallBE(HANDLE hContext)
    500 {
    501     SWR_CONTEXT* pContext = GetContext(hContext);
    502     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    503 
    504     pDC->dependent = true;
    505 }
    506 
    507 void SwrWaitForIdle(HANDLE hContext)
    508 {
    509     SWR_CONTEXT *pContext = GetContext(hContext);
    510 
    511     AR_API_BEGIN(APIWaitForIdle, 0);
    512 
    513     while (!pContext->dcRing.IsEmpty())
    514     {
    515         _mm_pause();
    516     }
    517 
    518     AR_API_END(APIWaitForIdle, 1);
    519 }
    520 
    521 void SwrWaitForIdleFE(HANDLE hContext)
    522 {
    523     SWR_CONTEXT *pContext = GetContext(hContext);
    524 
    525     AR_API_BEGIN(APIWaitForIdle, 0);
    526 
    527     while (pContext->drawsOutstandingFE > 0)
    528     {
    529         _mm_pause();
    530     }
    531 
    532     AR_API_END(APIWaitForIdle, 1);
    533 }
    534 
    535 void SwrSetVertexBuffers(
    536     HANDLE hContext,
    537     uint32_t numBuffers,
    538     const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
    539 {
    540     API_STATE* pState = GetDrawState(GetContext(hContext));
    541 
    542     for (uint32_t i = 0; i < numBuffers; ++i)
    543     {
    544         const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
    545         pState->vertexBuffers[pVB->index] = *pVB;
    546     }
    547 }
    548 
    549 void SwrSetIndexBuffer(
    550     HANDLE hContext,
    551     const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
    552 {
    553     API_STATE* pState = GetDrawState(GetContext(hContext));
    554 
    555     pState->indexBuffer = *pIndexBuffer;
    556 }
    557 
    558 void SwrSetFetchFunc(
    559     HANDLE hContext,
    560     PFN_FETCH_FUNC    pfnFetchFunc)
    561 {
    562     API_STATE* pState = GetDrawState(GetContext(hContext));
    563 
    564     pState->pfnFetchFunc = pfnFetchFunc;
    565 }
    566 
    567 void SwrSetSoFunc(
    568     HANDLE hContext,
    569     PFN_SO_FUNC    pfnSoFunc,
    570     uint32_t streamIndex)
    571 {
    572     API_STATE* pState = GetDrawState(GetContext(hContext));
    573 
    574     SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
    575 
    576     pState->pfnSoFunc[streamIndex] = pfnSoFunc;
    577 }
    578 
    579 void SwrSetSoState(
    580     HANDLE hContext,
    581     SWR_STREAMOUT_STATE* pSoState)
    582 {
    583     API_STATE* pState = GetDrawState(GetContext(hContext));
    584 
    585     pState->soState = *pSoState;
    586 }
    587 
    588 void SwrSetSoBuffers(
    589     HANDLE hContext,
    590     SWR_STREAMOUT_BUFFER* pSoBuffer,
    591     uint32_t slot)
    592 {
    593     API_STATE* pState = GetDrawState(GetContext(hContext));
    594 
    595     SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
    596 
    597     pState->soBuffer[slot] = *pSoBuffer;
    598 }
    599 
    600 void SwrSetVertexFunc(
    601     HANDLE hContext,
    602     PFN_VERTEX_FUNC pfnVertexFunc)
    603 {
    604     API_STATE* pState = GetDrawState(GetContext(hContext));
    605 
    606     pState->pfnVertexFunc = pfnVertexFunc;
    607 }
    608 
    609 void SwrSetFrontendState(
    610     HANDLE hContext,
    611     SWR_FRONTEND_STATE *pFEState)
    612 {
    613     API_STATE* pState = GetDrawState(GetContext(hContext));
    614     pState->frontendState = *pFEState;
    615 }
    616 
    617 void SwrSetGsState(
    618     HANDLE hContext,
    619     SWR_GS_STATE *pGSState)
    620 {
    621     API_STATE* pState = GetDrawState(GetContext(hContext));
    622     pState->gsState = *pGSState;
    623 }
    624 
    625 void SwrSetGsFunc(
    626     HANDLE hContext,
    627     PFN_GS_FUNC pfnGsFunc)
    628 {
    629     API_STATE* pState = GetDrawState(GetContext(hContext));
    630     pState->pfnGsFunc = pfnGsFunc;
    631 }
    632 
    633 void SwrSetCsFunc(
    634     HANDLE hContext,
    635     PFN_CS_FUNC pfnCsFunc,
    636     uint32_t totalThreadsInGroup,
    637     uint32_t totalSpillFillSize,
    638     uint32_t scratchSpaceSizePerInstance,
    639     uint32_t numInstances)
    640 {
    641     API_STATE* pState = GetDrawState(GetContext(hContext));
    642     pState->pfnCsFunc = pfnCsFunc;
    643     pState->totalThreadsInGroup = totalThreadsInGroup;
    644     pState->totalSpillFillSize = totalSpillFillSize;
    645     pState->scratchSpaceSize = scratchSpaceSizePerInstance;
    646     pState->scratchSpaceNumInstances = numInstances;
    647 }
    648 
    649 void SwrSetTsState(
    650     HANDLE hContext,
    651     SWR_TS_STATE *pState)
    652 {
    653     API_STATE* pApiState = GetDrawState(GetContext(hContext));
    654     pApiState->tsState = *pState;
    655 }
    656 
    657 void SwrSetHsFunc(
    658     HANDLE hContext,
    659     PFN_HS_FUNC pfnFunc)
    660 {
    661     API_STATE* pApiState = GetDrawState(GetContext(hContext));
    662     pApiState->pfnHsFunc = pfnFunc;
    663 }
    664 
    665 void SwrSetDsFunc(
    666     HANDLE hContext,
    667     PFN_DS_FUNC pfnFunc)
    668 {
    669     API_STATE* pApiState = GetDrawState(GetContext(hContext));
    670     pApiState->pfnDsFunc = pfnFunc;
    671 }
    672 
    673 void SwrSetDepthStencilState(
    674     HANDLE hContext,
    675     SWR_DEPTH_STENCIL_STATE *pDSState)
    676 {
    677     API_STATE* pState = GetDrawState(GetContext(hContext));
    678 
    679     pState->depthStencilState = *pDSState;
    680 }
    681 
    682 void SwrSetBackendState(
    683     HANDLE hContext,
    684     SWR_BACKEND_STATE *pBEState)
    685 {
    686     API_STATE* pState = GetDrawState(GetContext(hContext));
    687 
    688     pState->backendState = *pBEState;
    689 }
    690 
    691 void SwrSetDepthBoundsState(
    692     HANDLE hContext,
    693     SWR_DEPTH_BOUNDS_STATE *pDBState)
    694 {
    695     API_STATE* pState = GetDrawState(GetContext(hContext));
    696 
    697     pState->depthBoundsState = *pDBState;
    698 }
    699 
    700 void SwrSetPixelShaderState(
    701     HANDLE hContext,
    702     SWR_PS_STATE *pPSState)
    703 {
    704     API_STATE *pState = GetDrawState(GetContext(hContext));
    705     pState->psState = *pPSState;
    706 }
    707 
    708 void SwrSetBlendState(
    709     HANDLE hContext,
    710     SWR_BLEND_STATE *pBlendState)
    711 {
    712     API_STATE *pState = GetDrawState(GetContext(hContext));
    713     memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
    714 }
    715 
    716 void SwrSetBlendFunc(
    717     HANDLE hContext,
    718     uint32_t renderTarget,
    719     PFN_BLEND_JIT_FUNC pfnBlendFunc)
    720 {
    721     SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
    722     API_STATE *pState = GetDrawState(GetContext(hContext));
    723     pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
    724 }
    725 
    726 // update guardband multipliers for the viewport
    727 void updateGuardbands(API_STATE *pState)
    728 {
    729     uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
    730 
    731     for(uint32_t i = 0; i < numGbs; ++i)
    732     {
    733         // guardband center is viewport center
    734         pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
    735         pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
    736         pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
    737         pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
    738     }
    739 }
    740 
    741 void SwrSetRastState(
    742     HANDLE hContext,
    743     const SWR_RASTSTATE *pRastState)
    744 {
    745     SWR_CONTEXT *pContext = GetContext(hContext);
    746     API_STATE* pState = GetDrawState(pContext);
    747 
    748     memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
    749 }
    750 
    751 void SwrSetViewports(
    752     HANDLE hContext,
    753     uint32_t numViewports,
    754     const SWR_VIEWPORT* pViewports,
    755     const SWR_VIEWPORT_MATRICES* pMatrices)
    756 {
    757     SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
    758         "Invalid number of viewports.");
    759 
    760     SWR_CONTEXT *pContext = GetContext(hContext);
    761     API_STATE* pState = GetDrawState(pContext);
    762 
    763     memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
    764     // @todo Faster to copy portions of the SOA or just copy all of it?
    765     memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
    766 
    767     updateGuardbands(pState);
    768 }
    769 
    770 void SwrSetScissorRects(
    771     HANDLE hContext,
    772     uint32_t numScissors,
    773     const SWR_RECT* pScissors)
    774 {
    775     SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
    776         "Invalid number of scissor rects.");
    777 
    778     API_STATE* pState = GetDrawState(GetContext(hContext));
    779     memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
    780 };
    781 
    782 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
    783 {
    784     API_STATE *pState = &pDC->pState->state;
    785     uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
    786     pState->scissorsTileAligned = true;
    787 
    788     for (uint32_t index = 0; index < numScissors; ++index)
    789     {
    790         SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];
    791 
    792         // Set up scissor dimensions based on scissor or viewport
    793         if (pState->rastState.scissorEnable)
    794         {
    795             scissorInFixedPoint = pState->scissorRects[index];
    796         }
    797         else
    798         {
    799             // the vp width and height must be added to origin un-rounded then the result round to -inf.
    800             // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
    801             scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
    802             scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
    803             scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
    804             scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
    805         }
    806 
    807         // Clamp to max rect
    808         scissorInFixedPoint &= g_MaxScissorRect;
    809 
    810         // Test for tile alignment
    811         bool tileAligned;
    812         tileAligned  = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
    813         tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
    814         tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
    815         tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
    816 
    817         pState->scissorsTileAligned &= tileAligned;
    818 
    819         // Scale to fixed point
    820         scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
    821         scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
    822         scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
    823         scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
    824 
    825         // Make scissor inclusive
    826         scissorInFixedPoint.xmax -= 1;
    827         scissorInFixedPoint.ymax -= 1;
    828     }
    829 }
    830 
    831 
    832 // templated backend function tables
    833 
    834 void SetupPipeline(DRAW_CONTEXT *pDC)
    835 {
    836     DRAW_STATE* pState = pDC->pState;
    837     const SWR_RASTSTATE &rastState = pState->state.rastState;
    838     const SWR_PS_STATE &psState = pState->state.psState;
    839     BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
    840 
    841     // setup backend
    842     if (psState.pfnPixelShader == nullptr)
    843     {
    844         backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
    845     }
    846     else
    847     {
    848         const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
    849         const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
    850         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
    851         const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
    852         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
    853 
    854         // select backend function
    855         switch(psState.shadingRate)
    856         {
    857         case SWR_SHADING_RATE_PIXEL:
    858             if(bMultisampleEnable)
    859             {
    860                 // always need to generate I & J per sample for Z interpolation
    861                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
    862                 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern][psState.inputCoverage]
    863                                                                 [centroid][forcedSampleCount][canEarlyZ]
    864                     ;
    865             }
    866             else
    867             {
    868                 // always need to generate I & J per pixel for Z interpolation
    869                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
    870                 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
    871             }
    872             break;
    873         case SWR_SHADING_RATE_SAMPLE:
    874             SWR_ASSERT(rastState.bIsCenterPattern != true);
    875             // always need to generate I & J per sample for Z interpolation
    876             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
    877             backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
    878             break;
    879         default:
    880             SWR_ASSERT(0 && "Invalid shading rate");
    881             break;
    882         }
    883     }
    884 
    885     SWR_ASSERT(backendFuncs.pfnBackend);
    886 
    887     PFN_PROCESS_PRIMS pfnBinner;
    888 #if USE_SIMD16_FRONTEND
    889     PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
    890 #endif
    891     switch (pState->state.topology)
    892     {
    893     case TOP_POINT_LIST:
    894         pState->pfnProcessPrims = ClipPoints;
    895         pfnBinner = BinPoints;
    896 #if USE_SIMD16_FRONTEND
    897         pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
    898         pfnBinner_simd16 = BinPoints_simd16;
    899 #endif
    900         break;
    901     case TOP_LINE_LIST:
    902     case TOP_LINE_STRIP:
    903     case TOP_LINE_LOOP:
    904     case TOP_LINE_LIST_ADJ:
    905     case TOP_LISTSTRIP_ADJ:
    906         pState->pfnProcessPrims = ClipLines;
    907         pfnBinner = BinLines;
    908 #if USE_SIMD16_FRONTEND
    909         pState->pfnProcessPrims_simd16 = ClipLines_simd16;
    910         pfnBinner_simd16 = BinLines_simd16;
    911 #endif
    912         break;
    913     default:
    914         pState->pfnProcessPrims = ClipTriangles;
    915         pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
    916 #if USE_SIMD16_FRONTEND
    917         pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
    918         pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
    919 #endif
    920         break;
    921     };
    922 
    923 
    924     // disable clipper if viewport transform is disabled
    925     if (pState->state.frontendState.vpTransformDisable)
    926     {
    927         pState->pfnProcessPrims = pfnBinner;
    928 #if USE_SIMD16_FRONTEND
    929         pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
    930 #endif
    931     }
    932 
    933     if ((pState->state.psState.pfnPixelShader == nullptr) &&
    934         (pState->state.depthStencilState.depthTestEnable == FALSE) &&
    935         (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
    936         (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
    937         (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
    938         (pState->state.backendState.numAttributes == 0))
    939     {
    940         pState->pfnProcessPrims = nullptr;
    941 #if USE_SIMD16_FRONTEND
    942         pState->pfnProcessPrims_simd16 = nullptr;
    943 #endif
    944     }
    945 
    946     if (pState->state.soState.rasterizerDisable == true)
    947     {
    948         pState->pfnProcessPrims = nullptr;
    949 #if USE_SIMD16_FRONTEND
    950         pState->pfnProcessPrims_simd16 = nullptr;
    951 #endif
    952     }
    953 
    954 
    955     // set up the frontend attribute count
    956     pState->state.feNumAttributes = 0;
    957     const SWR_BACKEND_STATE& backendState = pState->state.backendState;
    958     if (backendState.swizzleEnable)
    959     {
    960         // attribute swizzling is enabled, iterate over the map and record the max attribute used
    961         for (uint32_t i = 0; i < backendState.numAttributes; ++i)
    962         {
    963             pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
    964         }
    965     }
    966     else
    967     {
    968         pState->state.feNumAttributes = pState->state.backendState.numAttributes;
    969     }
    970 
    971     if (pState->state.soState.soEnable)
    972     {
    973         uint32_t streamMasks = 0;
    974         for (uint32_t i = 0; i < 4; ++i)
    975         {
    976             streamMasks |= pState->state.soState.streamMasks[i];
    977         }
    978 
    979         DWORD maxAttrib;
    980         if (_BitScanReverse(&maxAttrib, streamMasks))
    981         {
    982             pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
    983         }
    984     }
    985 
    986     // complicated logic to test for cases where we don't need backing hottile memory for a draw
    987     // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
    988     pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
    989                                            !pState->state.depthStencilState.depthWriteEnable &&
    990                                            !pState->state.depthBoundsState.depthBoundsTestEnable &&
    991                                            pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
    992                                         (pState->state.depthStencilState.depthTestEnable ||
    993                                          pState->state.depthStencilState.depthWriteEnable ||
    994                                          pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false;
    995 
    996     pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
    997                                              !pState->state.depthStencilState.stencilWriteEnable &&
    998                                               pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
    999                                           // for stencil we have to check the double sided state as well
   1000                                           (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
   1001                                              !pState->state.depthStencilState.stencilWriteEnable &&
   1002                                               pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
   1003                                           (pState->state.depthStencilState.stencilTestEnable  ||
   1004                                            pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
   1005 
   1006 
   1007     uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
   1008 
   1009     // Disable hottile for surfaces with no writes
   1010     if (psState.pfnPixelShader != nullptr)
   1011     {
   1012         DWORD rt;
   1013         uint32_t rtMask = pState->state.psState.renderTargetMask;
   1014         while (_BitScanForward(&rt, rtMask))
   1015         {
   1016             rtMask &= ~(1 << rt);
   1017 
   1018             if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
   1019                 pState->state.blendState.renderTarget[rt].writeDisableRed &&
   1020                 pState->state.blendState.renderTarget[rt].writeDisableGreen &&
   1021                 pState->state.blendState.renderTarget[rt].writeDisableBlue)
   1022             {
   1023                 hotTileEnable &= ~(1 << rt);
   1024             }
   1025         }
   1026     }
   1027 
   1028     pState->state.colorHottileEnable = hotTileEnable;
   1029 
   1030 
   1031     // Setup depth quantization function
   1032     if (pState->state.depthHottileEnable)
   1033     {
   1034         switch (pState->state.rastState.depthFormat)
   1035         {
   1036         case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
   1037         case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
   1038         case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
   1039         case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
   1040         default: SWR_INVALID("Unsupported depth format for depth quantiztion.");
   1041             pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
   1042         }
   1043     }
   1044     else
   1045     {
   1046         // set up pass-through quantize if depth isn't enabled
   1047         pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
   1048     }
   1049 }
   1050 
   1051 //////////////////////////////////////////////////////////////////////////
   1052 /// @brief InitDraw
   1053 /// @param pDC - Draw context to initialize for this draw.
   1054 void InitDraw(
   1055     DRAW_CONTEXT *pDC,
   1056     bool isSplitDraw)
   1057 {
   1058     // We don't need to re-setup the scissors/pipeline state again for split draw.
   1059     if (isSplitDraw == false)
   1060     {
   1061         SetupMacroTileScissors(pDC);
   1062         SetupPipeline(pDC);
   1063     }
   1064 
   1065 
   1066 }
   1067 
   1068 //////////////////////////////////////////////////////////////////////////
   1069 /// @brief We can split the draw for certain topologies for better performance.
   1070 /// @param totalVerts - Total vertices for draw
   1071 /// @param topology - Topology used for draw
   1072 uint32_t MaxVertsPerDraw(
   1073     DRAW_CONTEXT* pDC,
   1074     uint32_t totalVerts,
   1075     PRIMITIVE_TOPOLOGY topology)
   1076 {
   1077     API_STATE& state = pDC->pState->state;
   1078 
   1079     uint32_t vertsPerDraw = totalVerts;
   1080 
   1081     if (state.soState.soEnable)
   1082     {
   1083         return totalVerts;
   1084     }
   1085 
   1086     switch (topology)
   1087     {
   1088     case TOP_POINT_LIST:
   1089     case TOP_TRIANGLE_LIST:
   1090         vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
   1091         break;
   1092 
   1093     case TOP_PATCHLIST_1:
   1094     case TOP_PATCHLIST_2:
   1095     case TOP_PATCHLIST_3:
   1096     case TOP_PATCHLIST_4:
   1097     case TOP_PATCHLIST_5:
   1098     case TOP_PATCHLIST_6:
   1099     case TOP_PATCHLIST_7:
   1100     case TOP_PATCHLIST_8:
   1101     case TOP_PATCHLIST_9:
   1102     case TOP_PATCHLIST_10:
   1103     case TOP_PATCHLIST_11:
   1104     case TOP_PATCHLIST_12:
   1105     case TOP_PATCHLIST_13:
   1106     case TOP_PATCHLIST_14:
   1107     case TOP_PATCHLIST_15:
   1108     case TOP_PATCHLIST_16:
   1109     case TOP_PATCHLIST_17:
   1110     case TOP_PATCHLIST_18:
   1111     case TOP_PATCHLIST_19:
   1112     case TOP_PATCHLIST_20:
   1113     case TOP_PATCHLIST_21:
   1114     case TOP_PATCHLIST_22:
   1115     case TOP_PATCHLIST_23:
   1116     case TOP_PATCHLIST_24:
   1117     case TOP_PATCHLIST_25:
   1118     case TOP_PATCHLIST_26:
   1119     case TOP_PATCHLIST_27:
   1120     case TOP_PATCHLIST_28:
   1121     case TOP_PATCHLIST_29:
   1122     case TOP_PATCHLIST_30:
   1123     case TOP_PATCHLIST_31:
   1124     case TOP_PATCHLIST_32:
   1125         if (pDC->pState->state.tsState.tsEnable)
   1126         {
   1127             uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
   1128             vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
   1129         }
   1130         break;
   1131 
   1132     // The Primitive Assembly code can only handle 1 RECT at a time.
   1133     case TOP_RECT_LIST:
   1134         vertsPerDraw = 3;
   1135         break;
   1136 
   1137     default:
   1138         // We are not splitting up draws for other topologies.
   1139         break;
   1140     }
   1141 
   1142     return vertsPerDraw;
   1143 }
   1144 
   1145 
   1146 //////////////////////////////////////////////////////////////////////////
   1147 /// @brief DrawInstanced
   1148 /// @param hContext - Handle passed back from SwrCreateContext
   1149 /// @param topology - Specifies topology for draw.
   1150 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
   1151 /// @param startVertex - Specifies start vertex for draw. (vertex data)
   1152 /// @param numInstances - How many instances to render.
   1153 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
   1154 void DrawInstanced(
   1155     HANDLE hContext,
   1156     PRIMITIVE_TOPOLOGY topology,
   1157     uint32_t numVertices,
   1158     uint32_t startVertex,
   1159     uint32_t numInstances = 1,
   1160     uint32_t startInstance = 0)
   1161 {
   1162     if (KNOB_TOSS_DRAW)
   1163     {
   1164         return;
   1165     }
   1166 
   1167     SWR_CONTEXT *pContext = GetContext(hContext);
   1168     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1169 
   1170     AR_API_BEGIN(APIDraw, pDC->drawId);
   1171     AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertices, startVertex, numInstances, startInstance));
   1172 
   1173     uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
   1174     uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
   1175     uint32_t remainingVerts = numVertices;
   1176 
   1177     API_STATE    *pState = &pDC->pState->state;
   1178     pState->topology = topology;
   1179     pState->forceFront = false;
   1180 
   1181     // disable culling for points/lines
   1182     uint32_t oldCullMode = pState->rastState.cullMode;
   1183     if (topology == TOP_POINT_LIST)
   1184     {
   1185         pState->rastState.cullMode = SWR_CULLMODE_NONE;
   1186         pState->forceFront = true;
   1187     }
   1188     else if (topology == TOP_RECT_LIST)
   1189     {
   1190         pState->rastState.cullMode = SWR_CULLMODE_NONE;
   1191     }
   1192 
   1193     int draw = 0;
   1194     while (remainingVerts)
   1195     {
   1196         uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
   1197         remainingVerts : maxVertsPerDraw;
   1198 
   1199         bool isSplitDraw = (draw > 0) ? true : false;
   1200         DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
   1201         InitDraw(pDC, isSplitDraw);
   1202 
   1203         pDC->FeWork.type = DRAW;
   1204         pDC->FeWork.pfnWork = GetProcessDrawFunc(
   1205             false,  // IsIndexed
   1206             false, // bEnableCutIndex
   1207             pState->tsState.tsEnable,
   1208             pState->gsState.gsEnable,
   1209             pState->soState.soEnable,
   1210             pDC->pState->pfnProcessPrims != nullptr);
   1211         pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
   1212         pDC->FeWork.desc.draw.startVertex = startVertex;
   1213         pDC->FeWork.desc.draw.numInstances = numInstances;
   1214         pDC->FeWork.desc.draw.startInstance = startInstance;
   1215         pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
   1216         pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
   1217 
   1218         pDC->cleanupState = (remainingVerts == numVertsForDraw);
   1219 
   1220         //enqueue DC
   1221         QueueDraw(pContext);
   1222 
   1223         AR_API_EVENT(DrawInstancedSplitEvent(pDC->drawId));
   1224 
   1225         remainingVerts -= numVertsForDraw;
   1226         draw++;
   1227     }
   1228 
   1229     // restore culling state
   1230     pDC = GetDrawContext(pContext);
   1231     pDC->pState->state.rastState.cullMode = oldCullMode;
   1232 
   1233     AR_API_END(APIDraw, numVertices * numInstances);
   1234 }
   1235 
   1236 //////////////////////////////////////////////////////////////////////////
   1237 /// @brief SwrDraw
   1238 /// @param hContext - Handle passed back from SwrCreateContext
   1239 /// @param topology - Specifies topology for draw.
   1240 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
   1241 /// @param primCount - Number of vertices.
   1242 void SwrDraw(
   1243     HANDLE hContext,
   1244     PRIMITIVE_TOPOLOGY topology,
   1245     uint32_t startVertex,
   1246     uint32_t numVertices)
   1247 {
   1248     DrawInstanced(hContext, topology, numVertices, startVertex);
   1249 }
   1250 
   1251 //////////////////////////////////////////////////////////////////////////
   1252 /// @brief SwrDrawInstanced
   1253 /// @param hContext - Handle passed back from SwrCreateContext
   1254 /// @param topology - Specifies topology for draw.
   1255 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
   1256 /// @param numInstances - How many instances to render.
   1257 /// @param startVertex - Specifies start vertex for draw. (vertex data)
   1258 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
   1259 void SwrDrawInstanced(
   1260     HANDLE hContext,
   1261     PRIMITIVE_TOPOLOGY topology,
   1262     uint32_t numVertsPerInstance,
   1263     uint32_t numInstances,
   1264     uint32_t startVertex,
   1265     uint32_t startInstance
   1266     )
   1267 {
   1268     DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
   1269 }
   1270 
   1271 //////////////////////////////////////////////////////////////////////////
   1272 /// @brief DrawIndexedInstanced
   1273 /// @param hContext - Handle passed back from SwrCreateContext
   1274 /// @param topology - Specifies topology for draw.
   1275 /// @param numIndices - Number of indices to read sequentially from index buffer.
   1276 /// @param indexOffset - Starting index into index buffer.
   1277 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
   1278 /// @param numInstances - Number of instances to render.
   1279 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
   1280 void DrawIndexedInstance(
   1281     HANDLE hContext,
   1282     PRIMITIVE_TOPOLOGY topology,
   1283     uint32_t numIndices,
   1284     uint32_t indexOffset,
   1285     int32_t baseVertex,
   1286     uint32_t numInstances = 1,
   1287     uint32_t startInstance = 0)
   1288 {
   1289     if (KNOB_TOSS_DRAW)
   1290     {
   1291         return;
   1292     }
   1293 
   1294     SWR_CONTEXT *pContext = GetContext(hContext);
   1295     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1296     API_STATE* pState = &pDC->pState->state;
   1297 
   1298     AR_API_BEGIN(APIDrawIndexed, pDC->drawId);
   1299     AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
   1300 
   1301     uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
   1302     uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
   1303     uint32_t remainingIndices = numIndices;
   1304 
   1305     uint32_t indexSize = 0;
   1306     switch (pState->indexBuffer.format)
   1307     {
   1308     case R32_UINT: indexSize = sizeof(uint32_t); break;
   1309     case R16_UINT: indexSize = sizeof(uint16_t); break;
   1310     case R8_UINT: indexSize = sizeof(uint8_t); break;
   1311     default:
   1312         SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
   1313     }
   1314 
   1315     int draw = 0;
   1316     uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
   1317     pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
   1318 
   1319     pState->topology = topology;
   1320     pState->forceFront = false;
   1321 
   1322     // disable culling for points/lines
   1323     uint32_t oldCullMode = pState->rastState.cullMode;
   1324     if (topology == TOP_POINT_LIST)
   1325     {
   1326         pState->rastState.cullMode = SWR_CULLMODE_NONE;
   1327         pState->forceFront = true;
   1328     }
   1329     else if (topology == TOP_RECT_LIST)
   1330     {
   1331         pState->rastState.cullMode = SWR_CULLMODE_NONE;
   1332     }
   1333 
   1334     while (remainingIndices)
   1335     {
   1336         uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
   1337         remainingIndices : maxIndicesPerDraw;
   1338 
   1339         // When breaking up draw, we need to obtain new draw context for each iteration.
   1340         bool isSplitDraw = (draw > 0) ? true : false;
   1341 
   1342         pDC = GetDrawContext(pContext, isSplitDraw);
   1343         InitDraw(pDC, isSplitDraw);
   1344 
   1345         pDC->FeWork.type = DRAW;
   1346         pDC->FeWork.pfnWork = GetProcessDrawFunc(
   1347             true,   // IsIndexed
   1348             pState->frontendState.bEnableCutIndex,
   1349             pState->tsState.tsEnable,
   1350             pState->gsState.gsEnable,
   1351             pState->soState.soEnable,
   1352             pDC->pState->pfnProcessPrims != nullptr);
   1353         pDC->FeWork.desc.draw.pDC = pDC;
   1354         pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
   1355         pDC->FeWork.desc.draw.pIB = (int*)pIB;
   1356         pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
   1357 
   1358         pDC->FeWork.desc.draw.numInstances = numInstances;
   1359         pDC->FeWork.desc.draw.startInstance = startInstance;
   1360         pDC->FeWork.desc.draw.baseVertex = baseVertex;
   1361         pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
   1362 
   1363         pDC->cleanupState = (remainingIndices == numIndicesForDraw);
   1364 
   1365         //enqueue DC
   1366         QueueDraw(pContext);
   1367 
   1368         AR_API_EVENT(DrawIndexedInstancedSplitEvent(pDC->drawId));
   1369 
   1370         pIB += maxIndicesPerDraw * indexSize;
   1371         remainingIndices -= numIndicesForDraw;
   1372         draw++;
   1373     }
   1374 
   1375     // Restore culling state
   1376     pDC = GetDrawContext(pContext);
   1377     pDC->pState->state.rastState.cullMode = oldCullMode;
   1378 
   1379     AR_API_END(APIDrawIndexed, numIndices * numInstances);
   1380 }
   1381 
   1382 
   1383 //////////////////////////////////////////////////////////////////////////
   1384 /// @brief DrawIndexed
   1385 /// @param hContext - Handle passed back from SwrCreateContext
   1386 /// @param topology - Specifies topology for draw.
   1387 /// @param numIndices - Number of indices to read sequentially from index buffer.
   1388 /// @param indexOffset - Starting index into index buffer.
   1389 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
   1390 void SwrDrawIndexed(
   1391     HANDLE hContext,
   1392     PRIMITIVE_TOPOLOGY topology,
   1393     uint32_t numIndices,
   1394     uint32_t indexOffset,
   1395     int32_t baseVertex
   1396     )
   1397 {
   1398     DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
   1399 }
   1400 
   1401 //////////////////////////////////////////////////////////////////////////
   1402 /// @brief SwrDrawIndexedInstanced
   1403 /// @param hContext - Handle passed back from SwrCreateContext
   1404 /// @param topology - Specifies topology for draw.
   1405 /// @param numIndices - Number of indices to read sequentially from index buffer.
   1406 /// @param numInstances - Number of instances to render.
   1407 /// @param indexOffset - Starting index into index buffer.
   1408 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
   1409 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
   1410 void SwrDrawIndexedInstanced(
   1411     HANDLE hContext,
   1412     PRIMITIVE_TOPOLOGY topology,
   1413     uint32_t numIndices,
   1414     uint32_t numInstances,
   1415     uint32_t indexOffset,
   1416     int32_t baseVertex,
   1417     uint32_t startInstance)
   1418 {
   1419     DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
   1420 }
   1421 
   1422 //////////////////////////////////////////////////////////////////////////
   1423 /// @brief SwrInvalidateTiles
   1424 /// @param hContext - Handle passed back from SwrCreateContext
   1425 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
   1426 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
   1427 ///                         be hottile size-aligned.
   1428 void SWR_API SwrInvalidateTiles(
   1429     HANDLE hContext,
   1430     uint32_t attachmentMask,
   1431     const SWR_RECT& invalidateRect)
   1432 {
   1433     if (KNOB_TOSS_DRAW)
   1434     {
   1435         return;
   1436     }
   1437 
   1438     SWR_CONTEXT *pContext = GetContext(hContext);
   1439     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1440 
   1441     pDC->FeWork.type = DISCARDINVALIDATETILES;
   1442     pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
   1443     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
   1444     pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect;
   1445     pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
   1446     pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
   1447     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
   1448     pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
   1449 
   1450     //enqueue
   1451     QueueDraw(pContext);
   1452 
   1453     AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
   1454 }
   1455 
   1456 //////////////////////////////////////////////////////////////////////////
   1457 /// @brief SwrDiscardRect
   1458 /// @param hContext - Handle passed back from SwrCreateContext
   1459 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
   1460 /// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
   1461 ///               discarded.
   1462 void SWR_API SwrDiscardRect(
   1463     HANDLE hContext,
   1464     uint32_t attachmentMask,
   1465     const SWR_RECT& rect)
   1466 {
   1467     if (KNOB_TOSS_DRAW)
   1468     {
   1469         return;
   1470     }
   1471 
   1472     SWR_CONTEXT *pContext = GetContext(hContext);
   1473     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1474 
   1475     // Queue a load to the hottile
   1476     pDC->FeWork.type = DISCARDINVALIDATETILES;
   1477     pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
   1478     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
   1479     pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
   1480     pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
   1481     pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
   1482     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
   1483     pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
   1484 
   1485     //enqueue
   1486     QueueDraw(pContext);
   1487 
   1488     AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
   1489 }
   1490 
   1491 //////////////////////////////////////////////////////////////////////////
   1492 /// @brief SwrDispatch
   1493 /// @param hContext - Handle passed back from SwrCreateContext
   1494 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
   1495 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
   1496 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
   1497 void SwrDispatch(
   1498     HANDLE hContext,
   1499     uint32_t threadGroupCountX,
   1500     uint32_t threadGroupCountY,
   1501     uint32_t threadGroupCountZ)
   1502 {
   1503     if (KNOB_TOSS_DRAW)
   1504     {
   1505         return;
   1506     }
   1507 
   1508     SWR_CONTEXT *pContext = GetContext(hContext);
   1509     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1510 
   1511     AR_API_BEGIN(APIDispatch, pDC->drawId);
   1512     AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
   1513     pDC->isCompute = true;      // This is a compute context.
   1514 
   1515     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
   1516 
   1517     pTaskData->threadGroupCountX = threadGroupCountX;
   1518     pTaskData->threadGroupCountY = threadGroupCountY;
   1519     pTaskData->threadGroupCountZ = threadGroupCountZ;
   1520 
   1521     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
   1522     uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
   1523     pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
   1524     pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
   1525 
   1526     QueueDispatch(pContext);
   1527     AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
   1528 }
   1529 
   1530 // Deswizzles, converts and stores current contents of the hot tiles to surface
   1531 // described by pState
   1532 void SWR_API SwrStoreTiles(
   1533     HANDLE hContext,
   1534     uint32_t attachmentMask,
   1535     SWR_TILE_STATE postStoreTileState,
   1536     const SWR_RECT& storeRect)
   1537 {
   1538     if (KNOB_TOSS_DRAW)
   1539     {
   1540         return;
   1541     }
   1542 
   1543     SWR_CONTEXT *pContext = GetContext(hContext);
   1544     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1545 
   1546     AR_API_BEGIN(APIStoreTiles, pDC->drawId);
   1547 
   1548     pDC->FeWork.type = STORETILES;
   1549     pDC->FeWork.pfnWork = ProcessStoreTiles;
   1550     pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask;
   1551     pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
   1552     pDC->FeWork.desc.storeTiles.rect = storeRect;
   1553     pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
   1554 
   1555     //enqueue
   1556     QueueDraw(pContext);
   1557 
   1558     AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
   1559 
   1560     AR_API_END(APIStoreTiles, 1);
   1561 }
   1562 
   1563 //////////////////////////////////////////////////////////////////////////
   1564 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
   1565 /// @param hContext - Handle passed back from SwrCreateContext
   1566 /// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
   1567 /// @param renderTargetArrayIndex - the RT array index to clear
   1568 /// @param clearColor - color use for clearing render targets
   1569 /// @param z - depth value use for clearing depth buffer
   1570 /// @param stencil - stencil value used for clearing stencil buffer
   1571 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
   1572 void SWR_API SwrClearRenderTarget(
   1573     HANDLE hContext,
   1574     uint32_t attachmentMask,
   1575     uint32_t renderTargetArrayIndex,
   1576     const float clearColor[4],
   1577     float z,
   1578     uint8_t stencil,
   1579     const SWR_RECT& clearRect)
   1580 {
   1581     if (KNOB_TOSS_DRAW)
   1582     {
   1583         return;
   1584     }
   1585 
   1586     SWR_CONTEXT *pContext = GetContext(hContext);
   1587     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1588 
   1589     AR_API_BEGIN(APIClearRenderTarget, pDC->drawId);
   1590 
   1591     pDC->FeWork.type = CLEAR;
   1592     pDC->FeWork.pfnWork = ProcessClear;
   1593     pDC->FeWork.desc.clear.rect = clearRect;
   1594     pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
   1595     pDC->FeWork.desc.clear.attachmentMask = attachmentMask;
   1596     pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
   1597     pDC->FeWork.desc.clear.clearDepth = z;
   1598     pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
   1599     pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
   1600     pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
   1601     pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
   1602     pDC->FeWork.desc.clear.clearStencil = stencil;
   1603 
   1604     // enqueue draw
   1605     QueueDraw(pContext);
   1606 
   1607     AR_API_END(APIClearRenderTarget, 1);
   1608 }
   1609 
   1610 //////////////////////////////////////////////////////////////////////////
   1611 /// @brief Returns a pointer to the private context state for the current
   1612 ///        draw operation. This is used for external componets such as the
   1613 ///        sampler.
   1614 ///        SWR is responsible for the allocation of the private context state.
   1615 /// @param hContext - Handle passed back from SwrCreateContext
   1616 VOID* SwrGetPrivateContextState(
   1617     HANDLE hContext)
   1618 {
   1619     SWR_CONTEXT* pContext = GetContext(hContext);
   1620     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1621     DRAW_STATE* pState = pDC->pState;
   1622 
   1623     if (pState->pPrivateState == nullptr)
   1624     {
   1625         pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
   1626     }
   1627 
   1628     return pState->pPrivateState;
   1629 }
   1630 
   1631 //////////////////////////////////////////////////////////////////////////
   1632 /// @brief Clients can use this to allocate memory for draw/dispatch
   1633 ///        operations. The memory will automatically be freed once operation
   1634 ///        has completed. Client can use this to allocate binding tables,
   1635 ///        etc. needed for shader execution.
   1636 /// @param hContext - Handle passed back from SwrCreateContext
   1637 /// @param size - Size of allocation
   1638 /// @param align - Alignment needed for allocation.
   1639 VOID* SwrAllocDrawContextMemory(
   1640     HANDLE hContext,
   1641     uint32_t size,
   1642     uint32_t align)
   1643 {
   1644     SWR_CONTEXT* pContext = GetContext(hContext);
   1645     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1646 
   1647     return pDC->pState->pArena->AllocAligned(size, align);
   1648 }
   1649 
   1650 //////////////////////////////////////////////////////////////////////////
   1651 /// @brief Enables stats counting
   1652 /// @param hContext - Handle passed back from SwrCreateContext
   1653 /// @param enable - If true then counts are incremented.
   1654 void SwrEnableStatsFE(
   1655     HANDLE hContext,
   1656     bool enable)
   1657 {
   1658     SWR_CONTEXT *pContext = GetContext(hContext);
   1659     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1660 
   1661     pDC->pState->state.enableStatsFE = enable;
   1662 }
   1663 
   1664 //////////////////////////////////////////////////////////////////////////
   1665 /// @brief Enables stats counting
   1666 /// @param hContext - Handle passed back from SwrCreateContext
   1667 /// @param enable - If true then counts are incremented.
   1668 void SwrEnableStatsBE(
   1669     HANDLE hContext,
   1670     bool enable)
   1671 {
   1672     SWR_CONTEXT *pContext = GetContext(hContext);
   1673     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1674 
   1675     pDC->pState->state.enableStatsBE = enable;
   1676 }
   1677 
   1678 //////////////////////////////////////////////////////////////////////////
   1679 /// @brief Mark end of frame - used for performance profiling
   1680 /// @param hContext - Handle passed back from SwrCreateContext
   1681 void SWR_API SwrEndFrame(
   1682     HANDLE hContext)
   1683 {
   1684     SWR_CONTEXT *pContext = GetContext(hContext);
   1685     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
   1686     (void)pDC; // var used
   1687 
   1688     RDTSC_ENDFRAME();
   1689     AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
   1690 
   1691     pContext->frameCount++;
   1692 }
   1693 
   1694 void InitSimLoadTilesTable();
   1695 void InitSimStoreTilesTable();
   1696 void InitSimClearTilesTable();
   1697 
   1698 void InitClearTilesTable();
   1699 void InitBackendFuncTables();
   1700 
   1701 //////////////////////////////////////////////////////////////////////////
   1702 /// @brief Initialize swr backend and memory internal tables
   1703 void SwrInit()
   1704 {
   1705     InitSimLoadTilesTable();
   1706     InitSimStoreTilesTable();
   1707     InitSimClearTilesTable();
   1708 
   1709     InitClearTilesTable();
   1710     InitBackendFuncTables();
   1711     InitRasterizerFunctions();
   1712 }
   1713 
   1714 void SwrGetInterface(SWR_INTERFACE &out_funcs)
   1715 {
   1716     out_funcs.pfnSwrCreateContext = SwrCreateContext;
   1717     out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
   1718     out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
   1719     out_funcs.pfnSwrSaveState = SwrSaveState;
   1720     out_funcs.pfnSwrRestoreState = SwrRestoreState;
   1721     out_funcs.pfnSwrSync = SwrSync;
   1722     out_funcs.pfnSwrStallBE = SwrStallBE;
   1723     out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
   1724     out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
   1725     out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
   1726     out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer;
   1727     out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc;
   1728     out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc;
   1729     out_funcs.pfnSwrSetSoState = SwrSetSoState;
   1730     out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers;
   1731     out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc;
   1732     out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState;
   1733     out_funcs.pfnSwrSetGsState = SwrSetGsState;
   1734     out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc;
   1735     out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc;
   1736     out_funcs.pfnSwrSetTsState = SwrSetTsState;
   1737     out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc;
   1738     out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc;
   1739     out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState;
   1740     out_funcs.pfnSwrSetBackendState = SwrSetBackendState;
   1741     out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState;
   1742     out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState;
   1743     out_funcs.pfnSwrSetBlendState = SwrSetBlendState;
   1744     out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc;
   1745     out_funcs.pfnSwrDraw = SwrDraw;
   1746     out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced;
   1747     out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed;
   1748     out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced;
   1749     out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles;
   1750     out_funcs.pfnSwrDiscardRect = SwrDiscardRect;
   1751     out_funcs.pfnSwrDispatch = SwrDispatch;
   1752     out_funcs.pfnSwrStoreTiles = SwrStoreTiles;
   1753     out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget;
   1754     out_funcs.pfnSwrSetRastState = SwrSetRastState;
   1755     out_funcs.pfnSwrSetViewports = SwrSetViewports;
   1756     out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects;
   1757     out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
   1758     out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
   1759     out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE;
   1760     out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE;
   1761     out_funcs.pfnSwrEndFrame = SwrEndFrame;
   1762     out_funcs.pfnSwrInit = SwrInit;
   1763     out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile;
   1764     out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface;
   1765     out_funcs.pfnSwrStoreHotTileClear = SwrStoreHotTileClear;
   1766 }
   1767