/****************************************************************************
 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/

#include <stdio.h>
#include <thread>
#include <algorithm>
#include <float.h>
#include <vector>
#include <utility>
#include <fstream>
#include <string>
#include <cstdlib>      // std::strtoul
#include <cstring>      // memset
#include <mutex>        // std::mutex, std::unique_lock

#if defined(__linux__) || defined(__gnu_linux__)
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#endif

#include "common/os.h"
#include "context.h"
#include "frontend.h"
#include "backend.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"

// ThreadId
struct Core
{
    uint32_t              procGroup = 0;
    std::vector<uint32_t> threadIds;
};

struct NumaNode
{
    std::vector<Core> cores;
};

typedef std::vector<NumaNode> CPUNumaNodes;

void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

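            // BitScanForwardSizeT returns the index of the lowest set bit in
            // the group's affinity mask; each set bit is one logical
            // processor. The loop below extracts one threadId per iteration
            // and clears its bit, so the scan terminates once the mask is 0.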
            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask. This means that we are in 32-bit mode and
                    // have seen more than 32 HW threads for this procGroup.
                    // Don't use it.
#if defined(_WIN64)
                    SWR_ASSERT(false, "Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);

                // Find NUMA node
                uint32_t numaId = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);

#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get the full topology.
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t threadId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t numaId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            if (threadId != uint32_t(-1))
            {
                // Save information.
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];
                if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(threadId);

                out_numThreadsPerProcGroup++;
            }

            auto data_start = line.find(": ") + 2;
            threadId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            numaId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
    }

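    // The loop above only flushes a logical-processor record when it sees the
    // *next* "processor" line, so the final record is still pending here and
    // must be saved explicitly.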
    if (threadId != uint32_t(-1))
    {
        // Save information.
        if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
        auto& numaNode = out_nodes[numaId];
        if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
        auto& core = numaNode.cores[coreId];

        core.procGroup = coreId;
        core.threadIds.push_back(threadId);
        out_numThreadsPerProcGroup++;
    }

    /* Prune empty NUMA nodes */
    for (auto it = out_nodes.begin(); it != out_nodes.end(); )
    {
        if ((*it).cores.size() == 0)
            it = out_nodes.erase(it);
        else
            ++it;
    }

    /* Prune empty cores */
    for (uint32_t node = 0; node < out_nodes.size(); node++)
    {
        auto& numaNode = out_nodes[node];
        auto it = numaNode.cores.begin();
        for ( ; it != numaNode.cores.end(); )
        {
            if (it->threadIds.size() == 0)
                it = numaNode.cores.erase(it);   // erase returns the next valid iterator
            else
                ++it;
        }
    }

#else

#error Unsupported platform

#endif
}

void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup = false)
{
    // Only bind threads when MAX_WORKER_THREADS isn't set.
    if (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false)
    {
        return;
    }

#if defined(_WIN32)

    GROUP_AFFINITY affinity = {};
    affinity.Group = procGroupId;

#if !defined(_WIN64)
    if (threadId >= 32)
    {
        // Hopefully we don't get here. Logic in CreateThreadPool should prevent this.
        SWR_REL_ASSERT(false, "Shouldn't get here");

        // In a 32-bit process on Windows it is impossible to bind
        // to logical processors 32-63 within a processor group.
        // In this case set the mask to 0 and let the system assign
        // the processor. Hopefully it will make smart choices.
        affinity.Mask = 0;
    }
    else
#endif
    {
        // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // not the individual HW thread.
        if (!pContext->threadInfo.MAX_WORKER_THREADS)
        {
            affinity.Mask = KAFFINITY(1) << threadId;
        }
    }

    SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);

#else

    cpu_set_t cpuset;
    pthread_t thread = pthread_self();
    CPU_ZERO(&cpuset);
    CPU_SET(threadId, &cpuset);

    pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);

#endif
}

INLINE
uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
    return pContext->dcRing.GetHead();
}

INLINE
DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId)
{
    return &pContext->dcRing[(drawId - 1) % KNOB_MAX_DRAWS_IN_FLIGHT];
}

INLINE
bool IDComparesLess(uint32_t a, uint32_t b)
{
    // Use signed delta to ensure that wrap-around to 0 is correctly handled.
    int32_t delta = int32_t(a - b);
    return (delta < 0);
}
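// Illustrative: once draw ids wrap past 0, e.g. a = 0xFFFFFFFE (pre-wrap)
// and b = 2 (post-wrap), a - b = 0xFFFFFFFC and int32_t(0xFFFFFFFC) = -4,
// so the older id still compares as "less" than the newer one. A plain
// unsigned compare would get this backwards.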

// Returns true if this draw's dependency has not yet been met.
INLINE
bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
{
    return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}

bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
{
    return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Update client stats.
INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
    {
        return;
    }

    DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
    SWR_STATS stats{ 0 };

    // Sum up stats across all workers before sending to client.
    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    {
        stats.DepthPassCount += dynState.pStats[i].DepthPassCount;

        stats.PsInvocations += dynState.pStats[i].PsInvocations;
        stats.CsInvocations += dynState.pStats[i].CsInvocations;
    }

    pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
}

INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    UpdateClientStats(pContext, workerId, pDC);

    if (pDC->retireCallback.pfnCallbackFunc)
    {
        pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
                                            pDC->retireCallback.userData2,
                                            pDC->retireCallback.userData3);
    }
}

// Inlined-only version.
INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    int32_t result = InterlockedDecrement((volatile LONG*)&pDC->threadsDone);
    SWR_ASSERT(result >= 0);

    if (result == 0)
    {
        ExecuteCallbacks(pContext, workerId, pDC);

        // Clean up memory allocations
        pDC->pArena->Reset(true);
        if (!pDC->isCompute)
        {
            pDC->pTileMgr->initialize();
        }
        if (pDC->cleanupState)
        {
            pDC->pState->pArena->Reset(true);
        }

        _ReadWriteBarrier();

        pContext->dcRing.Dequeue();  // Remove from tail
    }

    return result;
}

// Available to other translation units.
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
    return CompleteDrawContextInl(pContext, 0, pDC);
}
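// Note: pDC->threadsDone acts as a reference count. It is presumably primed
// elsewhere (when the DC is enqueued) with the number of workers that must
// pass over the draw; the worker that decrements it to zero is the one that
// retires the DC above.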

INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued)
{
    // Increment our current draw id to the first incomplete draw.
    drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawBE, drawEnqueued))
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];

        // If it's not compute and FE is not done, then break out of the loop.
        if (!pDC->doneFE && !pDC->isCompute) break;

        bool isWorkComplete = pDC->isCompute ?
            pDC->pDispatch->isWorkComplete() :
            pDC->pTileMgr->isWorkComplete();

        if (isWorkComplete)
        {
            curDrawBE++;
            CompleteDrawContextInl(pContext, workerId, pDC);
        }
        else
        {
            break;
        }
    }

    // If there are no more incomplete draws then return false.
    return IDComparesLess(curDrawBE, drawEnqueued);
}

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set, and each time it fails to lock a macrotile because it's already locked,
///                      it adds that tile to the lockedTiles set. As a worker begins to work on future
///                      draws, the lockedTiles set ensures that it doesn't work on tiles that may still
///                      have work pending in a previous draw. Additionally, lockedTiles is a heuristic
///                      that can steer a worker back to the same macrotile that it had been working on
///                      in a previous draw.
/// @returns true if the worker thread should shut down.
bool WorkOnFifoBE(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t &curDrawBE,
    TileSet& lockedTiles,
    uint32_t numaNode,
    uint32_t numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provide the history that ensures this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute) return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps the threading model simple,
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE) return false;

        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto& macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

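            // Partitioning note: with numaMask = numNodes - 1 (power-of-two
            // node counts), (x ^ y) & numaMask assigns macrotiles to NUMA
            // nodes in a checkerboard pattern, so neighboring tiles land on
            // different nodes and each node's workers skip tiles they don't own.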
            // Only work on tiles for this NUMA node.
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
            if (((x ^ y) & numaMask) != numaNode)
            {
                continue;
            }

            if (!tile->getNumQueued())
            {
                continue;
            }

            // We can only work on this tile if it's not in use by another thread.
            if (lockedTiles.find(tileID) != lockedTiles.end())
            {
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK *pWork;

                AR_BEGIN(WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile->dequeue();
                }
                AR_END(WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it,
                // then we can reset the locked list, as we know that all draws before the next one
                // are guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to the next draw since we know this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, workerId, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked, so add it to our locked-tiles set. This way we don't try locking it again.
                lockedTiles.insert(tileID);
            }
        }
    }

    return bShutdown;
}
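// Shutdown note: pool teardown is delivered through the same macrotile queues
// as draw work (BE_WORK items of type SHUTDOWN), so WorkOnFifoBE observes it
// in draw order and returns true, which tells the worker loop to exit.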

//////////////////////////////////////////////////////////////////////////
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
    {
        SWR_STATS_FE& stats = pDC->dynState.statsFE;

        AR_EVENT(FrontendStatsEvent(pDC->drawId,
            stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
            stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
            stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
            stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
        ));
        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));

        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
    }

    if (pContext->pfnUpdateSoWriteOffset)
    {
        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
        {
            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
                (pDC->pState->state.soBuffer[i].soWriteEnable))
            {
                pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
            }
        }
    }

    // Ensure all streaming writes are globally visible before marking this FE done.
    _mm_mfence();
    pDC->doneFE = true;

    InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
}

void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
{
    // Try to grab the next DC from the ring.
    uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawFE, drawEnqueued))
    {
        uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
        if (pDC->isCompute || pDC->doneFE)
        {
            CompleteDrawContextInl(pContext, workerId, pDC);
            curDrawFE++;
        }
        else
        {
            break;
        }
    }

    uint32_t lastRetiredFE = curDrawFE - 1;
    uint32_t curDraw = curDrawFE;
    while (IDComparesLess(curDraw, drawEnqueued))
    {
        uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];

        if (!pDC->isCompute && !pDC->FeLock)
        {
            if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
            {
                return;
            }

            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
            if (initial == 0)
            {
                // Successfully grabbed the DC, now run the FE.
                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);

                CompleteDrawFE(pContext, workerId, pDC);
            }
        }
        curDraw++;
    }
}
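// FE claim note: FeLock is claimed with a compare-and-swap (0 -> 1) above, so
// exactly one worker runs the frontend for a given draw even though every
// worker scans the same ring; losers simply move on to the next draw.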

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t& curDrawBE)
{
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false) return;

        // Check dependencies.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void* pSpillFillBuffer = nullptr;
            uint32_t threadGroupId = 0;
            while (queue.getWork(threadGroupId))
            {
                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
                queue.finishedWork();
            }

            // Ensure all streaming writes are globally visible before moving on to the next draw.
            _mm_mfence();
        }
    }
}

template<bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    RDTSC_INIT(threadId);

    uint32_t numaNode = pThreadData->numaId;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // Flush denormals to 0.
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked, then we'll add it to this list so that we don't try to lock it again.
    TileSet lockedTiles;

    // Each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to be active as
    // long as a worker hasn't signaled that it has moved on to the next draw when it
    // determines there is no more work to do. The API thread will not increment the head
    // of the DC ring until all workers have moved past the current head.
    //
    // The logic to determine what to work on is:
    //   1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //      on the FE work, so any worker can grab any FE and process it in parallel. Eventually
    //      we'll need dependency tracking to force serialization of FEs. The worker will try
    //      to pick an FE by atomically incrementing a counter in the SWR context. It will keep
    //      trying until it reaches the tail.
    //   2- BE work must be done in strict order. We accomplish this today by pulling work off
    //      the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
    //      any work left by comparing the total # of binned work items and the total # of completed
    //      work items. If they are equal, then there is no more work to do for this draw, and
    //      the worker can safely increment its oldestDraw counter and move on to the next draw.

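    // Idle strategy: spin briefly on _mm_pause() looking for new work before
    // taking WaitLock and blocking on the FifosNotEmpty condition variable.
    // The head is re-checked under the lock so a draw enqueued between the
    // spin and the wait can't be missed.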
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // Check for the thread-idle condition again under the lock.
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            AR_BEGIN(WorkerWorkOnFifoBE, 0);
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            AR_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;

template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadInit(LPVOID pData)
{
#if defined(_WIN32)
    __try
#endif // _WIN32
    {
        return workerThreadMain<IsFEThread, IsBEThread>(pData);
    }
#if defined(_WIN32)
    __except (EXCEPTION_CONTINUE_SEARCH)
    {
    }
#endif // _WIN32

    return 1;
}
template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;

//////////////////////////////////////////////////////////////////////////
/// @brief Creates thread pool info but doesn't launch threads.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    bindThread(pContext, 0);

    CPUNumaNodes nodes;
    uint32_t numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

    uint32_t numHWNodes        = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

    // Calculate the number of HW threads. Due to asymmetric topologies, this is not
    // a trivial multiplication.
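    // (A node may expose fewer cores than nodes[0], or a core fewer hyper-
    // threads than cores[0], so sum the actual threadIds per core instead.)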
    uint32_t numHWThreads = 0;
    for (auto& node : nodes)
    {
        for (auto& core : node.cores)
        {
            numHWThreads += (uint32_t)core.threadIds.size();
        }
    }

    uint32_t numNodes        = numHWNodes;
    uint32_t numCoresPerNode = numHWCoresPerNode;
    uint32_t numHyperThreads = numHWHyperThreads;

    if (pContext->threadInfo.MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
    }

    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
    }

    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
    }

#if defined(_WIN32) && !defined(_WIN64)
    if (!pContext->threadInfo.MAX_WORKER_THREADS)
    {
        // Limit 32-bit Windows to bindable HW threads only.
        if ((numCoresPerNode * numHWHyperThreads) > 32)
        {
            numCoresPerNode = 32 / numHWHyperThreads;
        }
    }
#endif

    // Calculate numThreads.
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
    numThreads = std::min(numThreads, numHWThreads);

    if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
    }

    uint32_t numAPIReservedThreads = 1;

    if (numThreads == 1)
    {
        // If there is only 1 worker thread, try to move it to an available
        // HW thread. If that fails, use the API thread.
        if (numCoresPerNode < numHWCoresPerNode)
        {
            numCoresPerNode++;
        }
        else if (numHyperThreads < numHWHyperThreads)
        {
            numHyperThreads++;
        }
        else if (numNodes < numHWNodes)
        {
            numNodes++;
        }
        else
        {
            pContext->threadInfo.SINGLE_THREADED = true;
        }
    }
    else
    {
        // Save HW threads for the API if we can.
        if (numThreads > numAPIReservedThreads)
        {
            numThreads -= numAPIReservedThreads;
        }
        else
        {
            numAPIReservedThreads = 0;
        }
    }

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        numThreads = 1;
    }

    // Initialize DRAW_CONTEXT's per-thread stats.
    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
    {
        pContext->dcRing[dc].dynState.pStats = new SWR_STATS[numThreads];
        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
    }

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        pContext->NumWorkerThreads = 1;
        pContext->NumFEThreads = 1;
        pContext->NumBEThreads = 1;
        pPool->numThreads = 0;

        return;
    }

    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
    pPool->numaMask = 0;

    pPool->pThreads = new THREAD_PTR[pPool->numThreads];

    if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
        uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
        // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads,
        // but Windows will still require binding to specific processor groups.
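        // Workers are therefore dealt out round-robin across the proc groups
        // (workerId % numProcGroups); bindThread() then pins each worker to
        // its group as a whole rather than to an individual HW thread.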
        for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
        {
            pPool->pThreadData[workerId].workerId = workerId;
            pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
            pPool->pThreadData[workerId].threadId = 0;
            pPool->pThreadData[workerId].numaId = 0;
            pPool->pThreadData[workerId].coreId = 0;
            pPool->pThreadData[workerId].htId = 0;
            pPool->pThreadData[workerId].pContext = pContext;
            pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;

            pContext->NumBEThreads++;
            pContext->NumFEThreads++;
        }
    }
    else
    {
        pPool->numaMask = numNodes - 1; // Only works for 2**n NUMA nodes (1, 2, 4, etc.)

        uint32_t workerId = 0;
        for (uint32_t n = 0; n < numNodes; ++n)
        {
            auto& node = nodes[n];
            uint32_t numCores = numCoresPerNode;
            for (uint32_t c = 0; c < numCores; ++c)
            {
                if (c >= node.cores.size())
                {
                    break;
                }

                auto& core = node.cores[c];
                for (uint32_t t = 0; t < numHyperThreads; ++t)
                {
                    if (t >= core.threadIds.size())
                    {
                        break;
                    }

                    if (numAPIReservedThreads)
                    {
                        --numAPIReservedThreads;
                        continue;
                    }

                    SWR_ASSERT(workerId < numThreads);

                    pPool->pThreadData[workerId].workerId = workerId;
                    pPool->pThreadData[workerId].procGroupId = core.procGroup;
                    pPool->pThreadData[workerId].threadId = core.threadIds[t];
                    pPool->pThreadData[workerId].numaId = n;
                    pPool->pThreadData[workerId].coreId = c;
                    pPool->pThreadData[workerId].htId = t;
                    pPool->pThreadData[workerId].pContext = pContext;

                    pContext->NumBEThreads++;
                    pContext->NumFEThreads++;

                    ++workerId;
                }
            }
        }
        SWR_ASSERT(workerId == pContext->NumWorkerThreads);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Launches worker threads in the thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    if (pContext->threadInfo.SINGLE_THREADED)
    {
        return;
    }

    for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
    {
        pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Destroys thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    if (!pContext->threadInfo.SINGLE_THREADED)
    {
        // Wait for all threads to finish.
        SwrWaitForIdle(pContext);

        // Wait for threads to finish and destroy them.
        for (uint32_t t = 0; t < pPool->numThreads; ++t)
        {
            // Detach from the thread. We cannot join() because of the possibility (on Windows)
            // of code in some DllMain (DLL_THREAD_DETACH case) blocking the thread until after this returns.
            pPool->pThreads[t]->detach();
            delete(pPool->pThreads[t]);
        }

        delete[] pPool->pThreads;

        // Clean up data used by the threads.
        free(pPool->pThreadData);
    }
}