/****************************************************************************
* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#include <stdio.h>
#include <thread>
#include <algorithm>
#include <float.h>
#include <vector>
#include <utility>
#include <fstream>
#include <string>
#include <cstdlib>   // std::strtoul
#include <cstring>   // memset, strerror
#include <mutex>     // std::mutex, std::lock_guard, std::unique_lock

#if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#endif

#include "common/os.h"
#include "context.h"
#include "frontend.h"
#include "backend.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"

// ThreadId
struct Core
{
    uint32_t                procGroup = 0;
    std::vector<uint32_t>   threadIds;
};

struct NumaNode
{
    uint32_t          numaId;
    std::vector<Core> cores;
};

typedef std::vector<NumaNode> CPUNumaNodes;

void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    DWORD bufSize = 0;

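    // Win32 two-call pattern: the first call is expected to fail with
    // ERROR_INSUFFICIENT_BUFFER and report the buffer size required for the
    // full RelationProcessorCore topology, which the second call then fills.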
    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask.  This means that we are in 32-bit mode and
                    // have seen more than 32 HW threads for this procGroup.
                    // Don't use it.
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);

                // Find Numa Node
                uint32_t numaId = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode = out_nodes[numaId];
                numaNode.numaId = numaId;

                uint32_t coreId = 0;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);


#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
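    // Each logical CPU appears in /proc/cpuinfo as a record of "key : value"
    // lines terminated by a blank line. The fields consumed below look like:
    //
    //   processor   : 7      (logical CPU id)
    //   physical id : 0      (socket id; treated here as the NUMA node)
    //   core id     : 3      (core within the socket)
    //
    // Note this assumes one NUMA node per physical package, which holds for
    // typical systems but not for multi-node packages (e.g. sub-NUMA modes).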
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t procId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.length() == 0)
        {
            if (physId + 1 > out_nodes.size())
                out_nodes.resize(physId + 1);
            auto& numaNode = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
                numaNode.cores.resize(coreId + 1);
            auto& core = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto &node : out_nodes)
    {
        for (auto &core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#elif defined(__APPLE__)

#else

#error Unsupported platform

#endif

    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}


void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
{
    // Only bind threads when MAX_WORKER_THREADS isn't set.
    if (pContext->threadInfo.SINGLE_THREADED || (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
    {
        return;
    }

#if defined(_WIN32)

    GROUP_AFFINITY affinity = {};
    affinity.Group = procGroupId;

#if !defined(_WIN64)
    if (threadId >= 32)
    {
        // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
        SWR_INVALID("Shouldn't get here");

        // In a 32-bit process on Windows it is impossible to bind
        // to logical processors 32-63 within a processor group.
        // In this case set the mask to 0 and let the system assign
        // the processor.  Hopefully it will make smart choices.
        affinity.Mask = 0;
    }
    else
#endif
    {
        // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // not the individual HW thread.
        if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
        {
            affinity.Mask = KAFFINITY(1) << threadId;
        }
        else
        {
            affinity.Mask = KAFFINITY(0);
        }
    }

    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
    {
        SWR_INVALID("Failed to set Thread Affinity");
    }

#elif defined(__linux__) || defined(__gnu_linux__)

    cpu_set_t cpuset;
    pthread_t thread = pthread_self();
    CPU_ZERO(&cpuset);
    CPU_SET(threadId, &cpuset);

    int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (err != 0)
    {
        fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
    }

#endif
}

INLINE
uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
    return pContext->dcRing.GetHead();
}

INLINE
DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId)
{
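    // drawIds are assumed to start at 1, so (drawId - 1) maps the first draw
    // to ring slot 0 and consecutive draws to consecutive slots.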
    return &pContext->dcRing[(drawId-1) % pContext->MAX_DRAWS_IN_FLIGHT];
}

INLINE
bool IDComparesLess(uint32_t a, uint32_t b)
{
    // Use signed delta to ensure that wrap-around to 0 is correctly handled.
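    // Illustrative example: after the 32-bit draw counter wraps, a newer id
    // a = 2 and an older id b = 0xFFFFFFFE give int32_t(b - a) = -4, so
    // IDComparesLess(b, a) is true even though b > a as raw unsigned values.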
    int32_t delta = int32_t(a - b);
    return (delta < 0);
}

// returns true if dependency not met
INLINE
bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
{
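    // A dependent draw with drawId N must wait until draw N-1 has retired;
    // e.g. pDC->drawId = 10 is blocked while lastRetiredDraw is still 8.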
    return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}

bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
{
    return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Update client stats.
INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
    {
        return;
    }

    DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
    OSALIGNLINE(SWR_STATS) stats{ 0 };

    // Sum up stats across all workers before sending to client.
    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    {
        stats.DepthPassCount += dynState.pStats[i].DepthPassCount;

        stats.PsInvocations  += dynState.pStats[i].PsInvocations;
        stats.CsInvocations  += dynState.pStats[i].CsInvocations;
    }


    pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
}

INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    UpdateClientStats(pContext, workerId, pDC);

    if (pDC->retireCallback.pfnCallbackFunc)
    {
        pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
            pDC->retireCallback.userData2,
            pDC->retireCallback.userData3);
    }
}

// inlined-only version
INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
    SWR_ASSERT(result >= 0);

    AR_FLUSH(pDC->drawId);

    if (result == 0)
    {
        ExecuteCallbacks(pContext, workerId, pDC);

        // Cleanup memory allocations
        pDC->pArena->Reset(true);
        if (!pDC->isCompute)
        {
            pDC->pTileMgr->initialize();
        }
        if (pDC->cleanupState)
        {
            pDC->pState->pArena->Reset(true);
        }

        _ReadWriteBarrier();

        pContext->dcRing.Dequeue();  // Remove from tail
    }

    return result;
}

// available to other translation modules
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
    return CompleteDrawContextInl(pContext, 0, pDC);
}

INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued)
{
    // increment our current draw id to the first incomplete draw
    drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawBE, drawEnqueued))
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];

        // If it's not compute and FE is not done, then break out of the loop.
        if (!pDC->doneFE && !pDC->isCompute) break;

        bool isWorkComplete = pDC->isCompute ?
            pDC->pDispatch->isWorkComplete() :
            pDC->pTileMgr->isWorkComplete();

        if (isWorkComplete)
        {
            curDrawBE++;
            CompleteDrawContextInl(pContext, workerId, pDC);
        }
        else
        {
            break;
        }
    }

    // If there are no more incomplete draws then return false.
    return IDComparesLess(curDrawBE, drawEnqueued);
}

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set; each time it fails to lock a macrotile, because it's already locked,
///                      it adds that tile to the lockedTiles set. As a worker begins to work on
///                      future draws, the lockedTiles set ensures that it doesn't work on tiles that
///                      may still have work pending in a previous draw. Additionally, the lockedTiles
///                      set is a heuristic that can steer a worker back to the same macrotile that it
///                      had been working on in a previous draw.
/// @returns        true if worker thread should shutdown
bool WorkOnFifoBE(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t &curDrawBE,
    TileSet& lockedTiles,
    uint32_t numaNode,
    uint32_t numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provide the history that ensures this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute) return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps threading model simple
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE) return false;

        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto &macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
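            // (x ^ y) & numaMask assigns macrotiles to NUMA nodes in a
            // checkerboard pattern. For example, with two nodes (numaMask = 1)
            // tile (x=3, y=5) lands on node (3 ^ 5) & 1 = 0, and its neighbor
            // (x=4, y=5) on node 1, spreading adjacent tiles across nodes.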
            if (((x ^ y) & numaMask) != numaNode)
            {
                continue;
            }

            if (!tile->getNumQueued())
            {
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.find(tileID) != lockedTiles.end())
            {
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK *pWork;

                AR_BEGIN(WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile->dequeue();
                }
                AR_END(WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it then
                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, workerId, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
                lockedTiles.insert(tileID);
            }
        }
    }

    return bShutdown;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
    {
        SWR_STATS_FE& stats = pDC->dynState.statsFE;

        AR_EVENT(FrontendStatsEvent(pDC->drawId,
            stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
            stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
            stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
            stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
        ));
        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));

        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
    }

    if (pContext->pfnUpdateSoWriteOffset)
    {
        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
        {
            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
                (pDC->pState->state.soBuffer[i].soWriteEnable))
            {
                pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
            }
        }
    }

    // Ensure all streaming writes are globally visible before marking this FE done
    _mm_mfence();
    pDC->doneFE = true;

    InterlockedDecrement(&pContext->drawsOutstandingFE);
}

void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
{
    // Try to grab the next DC from the ring
    uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawFE, drawEnqueued))
    {
        uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
        if (pDC->isCompute || pDC->doneFE)
        {
            CompleteDrawContextInl(pContext, workerId, pDC);
            curDrawFE++;
        }
        else
        {
            break;
        }
    }

    uint32_t lastRetiredFE = curDrawFE - 1;
    uint32_t curDraw = curDrawFE;
    while (IDComparesLess(curDraw, drawEnqueued))
    {
        uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];

        if (!pDC->isCompute && !pDC->FeLock)
        {
            if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
            {
                return;
            }

            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
            if (initial == 0)
            {
                // successfully grabbed the DC, now run the FE
                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);

                CompleteDrawFE(pContext, workerId, pDC);
            }
        }
        curDraw++;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t& curDrawBE)
{
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false) return;

        // check dependencies
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void* pSpillFillBuffer = nullptr;
            void* pScratchSpace = nullptr;
            uint32_t threadGroupId = 0;
            while (queue.getWork(threadGroupId))
            {
                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
                queue.finishedWork();
            }

            // Ensure all streaming writes are globally visible before moving onto the next draw
            _mm_mfence();
        }
    }
}

void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
{
    if (nullptr == pContext)
    {
        return;
    }

    if (apiThreadId >= pContext->threadPool.numReservedThreads)
    {
        if (pContext->threadPool.numReservedThreads)
        {
            const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
            // Just bind to the process group used for API thread 0
            bindThread(pContext, 0, threadData.procGroupId, true);
        }
        return;
    }

    const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];

    bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
}

template<bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
            workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(threadId);

    // Only need offset numa index from base for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
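    // (FTZ/DAZ presumably avoids the large microcode-assist penalty x86 cores
    // take on denormal float arithmetic; rendering doesn't need denormal precision.)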
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked, then we'll add it to this list so that we don't try to lock it again.
    TileSet lockedTiles;

    // Each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to be active as
    // long as a worker hasn't signaled that it has moved on to the next draw when it
    // determines there is no more work to do. The API thread will not increment the head
    // of the DC ring until all workers have moved past the current head.
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process it in parallel.  Eventually
    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context.  It'll keep
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. The worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

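        // Spin briefly before blocking on the condition variable; this trades
        // a little CPU for lower wake-up latency when the gap between draws
        // is short (KNOB_WORKER_SPIN_LOOP_COUNT bounds the spin).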
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            AR_BEGIN(WorkerWorkOnFifoBE, 0);
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            AR_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;

template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadInit(LPVOID pData)
{
#if defined(_WIN32)
    __try
#endif // _WIN32
    {
        return workerThreadMain<IsFEThread, IsBEThread>(pData);
    }

#if defined(_WIN32)
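    // EXCEPTION_CONTINUE_SEARCH passes any SEH exception up the handler
    // chain (e.g. to a debugger or crash reporter) rather than handling it here.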
    __except(EXCEPTION_CONTINUE_SEARCH)
    {
    }

#endif // _WIN32

    return 1;
}
template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;

static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
{
    // Initialize DRAW_CONTEXT's per-thread stats
    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
    {
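        // The stats block is 64-byte (cache line) aligned, presumably to
        // reduce false sharing between workers updating adjacent counters.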
        pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Creates thread pool info but doesn't launch threads.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    CPUNumaNodes nodes;
    uint32_t numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

    // Assumption, for asymmetric topologies, multi-threaded cores will appear
    // in the list before single-threaded cores.  This appears to be true for
    // Windows when the total HW threads is limited to 64.
    uint32_t numHWNodes         = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode  = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads  = (uint32_t)nodes[0].cores[0].threadIds.size();

#if defined(_WIN32) && !defined(_WIN64)
    if (!pContext->threadInfo.MAX_WORKER_THREADS)
    {
        // Limit 32-bit windows to bindable HW threads only
        if ((numHWCoresPerNode * numHWHyperThreads) > 32)
        {
            numHWCoresPerNode = 32 / numHWHyperThreads;
        }
    }
#endif

    // Calculate num HW threads.  Due to asymmetric topologies, this is not
    // a trivial multiplication.
    uint32_t numHWThreads = 0;
    for (auto const& node : nodes)
    {
        for (auto const& core : node.cores)
        {
            numHWThreads += (uint32_t)core.threadIds.size();
        }
    }

    uint32_t numNodes           = numHWNodes;
    uint32_t numCoresPerNode    = numHWCoresPerNode;
    uint32_t numHyperThreads    = numHWHyperThreads;

    // Calc used threads per-core
    if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
    {
        numHyperThreads -= pContext->threadInfo.BASE_THREAD;
    }
    else
    {
        SWR_ASSERT(
            false,
            "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
            pContext->threadInfo.BASE_THREAD,
            numHyperThreads);
        pContext->threadInfo.BASE_THREAD = 0;
    }

    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
    }

    // Prune any cores that don't support the number of threads
    if (numHyperThreads > 1)
    {
        for (auto& node : nodes)
        {
            uint32_t numUsableCores = 0;
            for (auto& core : node.cores)
            {
                numUsableCores += (core.threadIds.size() >= numHyperThreads);
            }
            numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
        }
    }

    // Calc used cores per NUMA node
    if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
    {
        numCoresPerNode -= pContext->threadInfo.BASE_CORE;
    }
    else
    {
        SWR_ASSERT(
            false,
            "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
            pContext->threadInfo.BASE_CORE,
            numCoresPerNode);
        pContext->threadInfo.BASE_CORE = 0;
    }

    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
    }

    // Calc used NUMA nodes
    if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
    {
        numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
    }
    else
    {
        SWR_ASSERT(
            false,
            "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
            pContext->threadInfo.BASE_NUMA_NODE,
            numNodes);
        pContext->threadInfo.BASE_NUMA_NODE = 0;
    }

    if (pContext->threadInfo.MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
    }

    // Calculate numThreads - at this point everything should be symmetric
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
    SWR_REL_ASSERT(numThreads <= numHWThreads);

    uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
    uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
    uint32_t numRemovedThreads = 0;

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        numAPIReservedThreads = 0;
        numThreads = 1;
        pContext->NumWorkerThreads = 1;
        pContext->NumFEThreads = 1;
        pContext->NumBEThreads = 1;
        pPool->numThreads = 0;
    }
    else if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
        pContext->threadInfo.BASE_NUMA_NODE = 0;
        pContext->threadInfo.BASE_CORE = 0;
        pContext->threadInfo.BASE_THREAD = 0;
        numAPIReservedThreads = 0;
    }
    else
    {
        if (numAPIReservedThreads >= numThreads)
        {
            numAPIReservedThreads = 0;
        }
        else if (numAPIReservedThreads)
        {
            numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);

            if (0 == numAPIThreadsPerCore)
            {
                numAPIThreadsPerCore = numHWHyperThreads;
            }

            numRemovedThreads = numAPIReservedThreads;
            if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
            {
                // Adjust removed threads to make logic below work
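                // e.g. reserving 3 API threads at 2 per core occupies 2 cores;
                // with workers running 1 thread per core that displaces only
                // 2 workers: max(1, (3 + 2 - 1) / 2) = 2.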
                numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
            }

            numThreads -= numRemovedThreads;
        }
    }

    InitPerThreadStats(pContext, numThreads);

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        return;
    }

    if (numAPIReservedThreads)
    {
        pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
        SWR_ASSERT(pPool->pApiThreadData);
        if (!pPool->pApiThreadData)
        {
            numAPIReservedThreads = 0;
        }
    }
    pPool->numReservedThreads = numAPIReservedThreads;

    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
    SWR_ASSERT(pPool->pThreadData);
    pPool->numaMask = 0;


    pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
    SWR_ASSERT(pPool->pThreads);

    if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
        uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
        // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads,
        // but Windows will still require binding to specific process groups
        for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
        {
            pPool->pThreadData[workerId].workerId = workerId;
            pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
            pPool->pThreadData[workerId].threadId = 0;
            pPool->pThreadData[workerId].numaId = 0;
            pPool->pThreadData[workerId].coreId = 0;
            pPool->pThreadData[workerId].htId = 0;
            pPool->pThreadData[workerId].pContext = pContext;
            pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;

            pContext->NumBEThreads++;
            pContext->NumFEThreads++;
        }
    }
    else
    {
        // numa distribution assumes workers on all nodes
        bool useNuma = true;
        if (numCoresPerNode * numHyperThreads == 1)
        {
            useNuma = false;
        }

        if (useNuma)
        {
            pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
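            // e.g. numNodes = 4 gives numaMask = 0b11, a clean 2-bit mask;
            // a non-power-of-two count (say 3) would give 0b10, and node 1
            // would never be selected by the (x ^ y) & numaMask tile assignment.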
        }
        else
        {
            pPool->numaMask = 0;
        }

        uint32_t workerId = 0;
        uint32_t numReservedThreads = numAPIReservedThreads;
        for (uint32_t n = 0; n < numNodes; ++n)
        {
            if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
            {
                break;
            }
            auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
            uint32_t numCores = numCoresPerNode;
            for (uint32_t c = 0; c < numCores; ++c)
            {
                if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
                {
                    break;
                }

                auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
                for (uint32_t t = 0; t < numHyperThreads; ++t)
                {
                    if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
                    {
                        break;
                    }

                    if (numRemovedThreads)
                    {
                        --numRemovedThreads;
                        SWR_REL_ASSERT(numReservedThreads);
                        --numReservedThreads;
                        pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
                        pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
                        pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
                        pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                        pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
                        pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
                        pPool->pApiThreadData[numReservedThreads].pContext = pContext;
                        pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;


                        if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
                        {
                            --numReservedThreads;
                            pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
                            pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
                            pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
                            pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                            pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
                            pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
                            pPool->pApiThreadData[numReservedThreads].pContext = pContext;
                            pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
                        }

                        continue;
                    }

                    SWR_ASSERT(workerId < numThreads);

                    pPool->pThreadData[workerId].workerId = workerId;
                    pPool->pThreadData[workerId].procGroupId = core.procGroup;
                    pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
                    pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                    pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
                    pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
                    pPool->pThreadData[workerId].pContext = pContext;
                    pPool->pThreadData[workerId].forceBindProcGroup = false;

                    pContext->NumBEThreads++;
                    pContext->NumFEThreads++;

                    ++workerId;
                }
            }
        }
        SWR_ASSERT(workerId == pContext->NumWorkerThreads);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Launches worker threads in thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    if (pContext->threadInfo.SINGLE_THREADED)
    {
        return;
    }

    for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
    {
        pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Destroys thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    if (!pContext->threadInfo.SINGLE_THREADED)
    {
        // Wait for all threads to finish
        SwrWaitForIdle(pContext);

        // Wait for threads to finish and destroy them
        for (uint32_t t = 0; t < pPool->numThreads; ++t)
        {
            // Detach from thread.  Cannot join() due to possibility (in Windows) of code
            // in some DllMain (DLL_THREAD_DETACH case) blocking the thread until after this returns.
            pPool->pThreads[t]->detach();
            delete(pPool->pThreads[t]);
        }

        delete[] pPool->pThreads;

        // Clean up data used by threads
        delete[] pPool->pThreadData;
        delete[] pPool->pApiThreadData;
    }
}
   1263