1 /**************************************************************************** 2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file context.h 24 * 25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT 26 * The SWR_CONTEXT is our global context and contains the DC ring, 27 * thread state, etc. 28 * 29 * The DRAW_CONTEXT contains all state associated with a draw operation. 30 * 31 ******************************************************************************/ 32 #pragma once 33 34 #include <condition_variable> 35 #include <algorithm> 36 37 #include "core/api.h" 38 #include "core/utils.h" 39 #include "core/arena.h" 40 #include "core/fifo.hpp" 41 #include "core/knobs.h" 42 #include "common/intrin.h" 43 #include "core/threads.h" 44 #include "ringbuffer.h" 45 #include "archrast/archrast.h" 46 47 // x.8 fixed point precision values 48 #define FIXED_POINT_SHIFT 8 49 #define FIXED_POINT_SCALE 256 50 51 // x.16 fixed point precision values 52 #define FIXED_POINT16_SHIFT 16 53 #define FIXED_POINT16_SCALE 65536 54 55 struct SWR_CONTEXT; 56 struct DRAW_CONTEXT; 57 58 struct TRI_FLAGS 59 { 60 uint32_t frontFacing : 1; 61 uint32_t yMajor : 1; 62 uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); 63 uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); 64 float pointSize; 65 uint32_t renderTargetArrayIndex; 66 uint32_t viewportIndex; 67 }; 68 69 ////////////////////////////////////////////////////////////////////////// 70 /// SWR_TRIANGLE_DESC 71 ///////////////////////////////////////////////////////////////////////// 72 struct SWR_TRIANGLE_DESC 73 { 74 float I[3]; 75 float J[3]; 76 float Z[3]; 77 float OneOverW[3]; 78 float recipDet; 79 80 float *pRecipW; 81 float *pAttribs; 82 float *pPerspAttribs; 83 float *pSamplePos; 84 float *pUserClipBuffer; 85 86 uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; 87 uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered 88 uint64_t anyCoveredSamples; 89 90 TRI_FLAGS triFlags; 91 }; 92 93 struct TRIANGLE_WORK_DESC 94 { 95 float *pTriBuffer; 96 float *pAttribs; 97 float *pUserClipBuffer; 98 uint32_t numAttribs; 99 TRI_FLAGS triFlags; 100 }; 101 102 struct CLEAR_DESC 103 { 104 SWR_RECT rect; 105 uint32_t attachmentMask; 106 uint32_t renderTargetArrayIndex; 107 float clearRTColor[4]; // RGBA_32F 108 float clearDepth; // [0..1] 109 uint8_t clearStencil; 110 }; 111 112 struct DISCARD_INVALIDATE_TILES_DESC 113 { 114 uint32_t attachmentMask; 115 SWR_RECT rect; 116 SWR_TILE_STATE newTileState; 117 bool createNewTiles; 118 bool fullTilesOnly; 119 }; 120 121 struct SYNC_DESC 122 { 123 PFN_CALLBACK_FUNC pfnCallbackFunc; 124 uint64_t userData; 125 uint64_t userData2; 126 uint64_t userData3; 127 }; 128 129 struct STORE_TILES_DESC 130 { 131 uint32_t attachmentMask; 132 SWR_TILE_STATE postStoreTileState; 133 SWR_RECT rect; 134 }; 135 136 struct COMPUTE_DESC 137 { 138 uint32_t threadGroupCountX; 139 uint32_t threadGroupCountY; 140 uint32_t threadGroupCountZ; 141 }; 142 143 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); 144 145 enum WORK_TYPE 146 { 147 SYNC, 148 DRAW, 149 CLEAR, 150 DISCARDINVALIDATETILES, 151 STORETILES, 152 SHUTDOWN, 153 }; 154 155 OSALIGNSIMD(struct) BE_WORK 156 { 157 WORK_TYPE type; 158 PFN_WORK_FUNC pfnWork; 159 union 160 { 161 SYNC_DESC sync; 162 TRIANGLE_WORK_DESC tri; 163 CLEAR_DESC clear; 164 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; 165 STORE_TILES_DESC storeTiles; 166 } desc; 167 }; 168 169 struct DRAW_WORK 170 { 171 DRAW_CONTEXT* pDC; 172 union 173 { 174 uint32_t numIndices; // DrawIndexed: Number of indices for draw. 175 uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) 176 }; 177 union 178 { 179 const int32_t* pIB; // DrawIndexed: App supplied indices 180 uint32_t startVertex; // Draw: Starting vertex in VB to render from. 181 }; 182 int32_t baseVertex; 183 uint32_t numInstances; // Number of instances 184 uint32_t startInstance; // Instance offset 185 uint32_t startPrimID; // starting primitiveID for this draw batch 186 uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) 187 SWR_FORMAT type; // index buffer type 188 }; 189 190 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); 191 struct FE_WORK 192 { 193 WORK_TYPE type; 194 PFN_FE_WORK_FUNC pfnWork; 195 union 196 { 197 SYNC_DESC sync; 198 DRAW_WORK draw; 199 CLEAR_DESC clear; 200 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; 201 STORE_TILES_DESC storeTiles; 202 } desc; 203 }; 204 205 struct GUARDBANDS 206 { 207 float left[KNOB_NUM_VIEWPORTS_SCISSORS]; 208 float right[KNOB_NUM_VIEWPORTS_SCISSORS]; 209 float top[KNOB_NUM_VIEWPORTS_SCISSORS]; 210 float bottom[KNOB_NUM_VIEWPORTS_SCISSORS]; 211 }; 212 213 struct PA_STATE; 214 215 // function signature for pipeline stages that execute after primitive assembly 216 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], 217 uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx); 218 219 #if ENABLE_AVX512_SIMD16 220 // function signature for pipeline stages that execute after primitive assembly 221 typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], 222 uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); 223 224 #endif 225 OSALIGNLINE(struct) API_STATE 226 { 227 // Vertex Buffers 228 SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; 229 230 // Index Buffer 231 SWR_INDEX_BUFFER_STATE indexBuffer; 232 233 // FS - Fetch Shader State 234 PFN_FETCH_FUNC pfnFetchFunc; 235 236 // VS - Vertex Shader State 237 PFN_VERTEX_FUNC pfnVertexFunc; 238 239 // GS - Geometry Shader State 240 PFN_GS_FUNC pfnGsFunc; 241 SWR_GS_STATE gsState; 242 243 // CS - Compute Shader 244 PFN_CS_FUNC pfnCsFunc; 245 uint32_t totalThreadsInGroup; 246 uint32_t totalSpillFillSize; 247 uint32_t scratchSpaceSize; 248 uint32_t scratchSpaceNumInstances; 249 250 // FE - Frontend State 251 SWR_FRONTEND_STATE frontendState; 252 253 // SOS - Streamout Shader State 254 PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; 255 256 // Streamout state 257 SWR_STREAMOUT_STATE soState; 258 mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; 259 260 // Tessellation State 261 PFN_HS_FUNC pfnHsFunc; 262 PFN_DS_FUNC pfnDsFunc; 263 SWR_TS_STATE tsState; 264 265 // Number of attributes used by the frontend (vs, so, gs) 266 uint32_t feNumAttributes; 267 268 PRIMITIVE_TOPOLOGY topology; 269 bool forceFront; 270 271 // RS - Rasterizer State 272 SWR_RASTSTATE rastState; 273 // floating point multisample offsets 274 float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2]; 275 276 GUARDBANDS gbState; 277 278 SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; 279 SWR_VIEWPORT_MATRICES vpMatrices; 280 281 SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; 282 SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; 283 bool scissorsTileAligned; 284 285 // Backend state 286 SWR_BACKEND_STATE backendState; 287 288 SWR_DEPTH_BOUNDS_STATE depthBoundsState; 289 290 // PS - Pixel shader state 291 SWR_PS_STATE psState; 292 293 SWR_DEPTH_STENCIL_STATE depthStencilState; 294 295 // OM - Output Merger State 296 SWR_BLEND_STATE blendState; 297 PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; 298 299 struct 300 { 301 uint32_t enableStatsFE : 1; // Enable frontend pipeline stats 302 uint32_t enableStatsBE : 1; // Enable backend pipeline stats 303 uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles 304 uint32_t depthHottileEnable: 1; // Enable depth buffer hottile 305 uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile 306 }; 307 308 PFN_QUANTIZE_DEPTH pfnQuantizeDepth; 309 }; 310 311 class MacroTileMgr; 312 class DispatchQueue; 313 314 struct RenderOutputBuffers 315 { 316 uint8_t* pColor[SWR_NUM_RENDERTARGETS]; 317 uint8_t* pDepth; 318 uint8_t* pStencil; 319 }; 320 321 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords 322 struct BarycentricCoeffs 323 { 324 simdscalar vIa; 325 simdscalar vIb; 326 simdscalar vIc; 327 328 simdscalar vJa; 329 simdscalar vJb; 330 simdscalar vJc; 331 332 simdscalar vZa; 333 simdscalar vZb; 334 simdscalar vZc; 335 336 simdscalar vRecipDet; 337 338 simdscalar vAOneOverW; 339 simdscalar vBOneOverW; 340 simdscalar vCOneOverW; 341 }; 342 343 // pipeline function pointer types 344 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); 345 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*, 346 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &); 347 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &); 348 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); 349 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t, 350 simdscalar const &, simdscalar const &); 351 352 struct BACKEND_FUNCS 353 { 354 PFN_BACKEND_FUNC pfnBackend; 355 }; 356 357 // Draw State 358 struct DRAW_STATE 359 { 360 API_STATE state; 361 362 void* pPrivateState; // Its required the driver sets this up for each draw. 363 364 // pipeline function pointers, filled in by API thread when setting up the draw 365 BACKEND_FUNCS backendFuncs; 366 PFN_PROCESS_PRIMS pfnProcessPrims; 367 #if USE_SIMD16_FRONTEND 368 PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16; 369 #endif 370 371 CachingArena* pArena; // This should only be used by API thread. 372 }; 373 374 struct DRAW_DYNAMIC_STATE 375 { 376 void Reset(uint32_t numThreads) 377 { 378 SWR_STATS* pSavePtr = pStats; 379 memset(this, 0, sizeof(*this)); 380 pStats = pSavePtr; 381 memset(pStats, 0, sizeof(SWR_STATS) * numThreads); 382 } 383 ///@todo Currently assumes only a single FE can do stream output for a draw. 384 uint32_t SoWriteOffset[4]; 385 bool SoWriteOffsetDirty[4]; 386 387 SWR_STATS_FE statsFE; // Only one FE thread per DC. 388 SWR_STATS* pStats; 389 }; 390 391 // Draw Context 392 // The api thread sets up a draw context that exists for the life of the draw. 393 // This draw context maintains all of the state needed for the draw operation. 394 struct DRAW_CONTEXT 395 { 396 SWR_CONTEXT* pContext; 397 union 398 { 399 MacroTileMgr* pTileMgr; 400 DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) 401 }; 402 DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread. 403 DRAW_DYNAMIC_STATE dynState; 404 405 CachingArena* pArena; 406 407 uint32_t drawId; 408 bool dependentFE; // Frontend work is dependent on all previous FE 409 bool dependent; // Backend work is dependent on all previous BE 410 bool isCompute; // Is this DC a compute context? 411 bool cleanupState; // True if this is the last draw using an entry in the state ring. 412 413 FE_WORK FeWork; 414 415 volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? 416 volatile OSALIGNLINE(uint32_t) FeLock; 417 volatile OSALIGNLINE(uint32_t) threadsDone; 418 419 SYNC_DESC retireCallback; // Call this func when this DC is retired. 420 }; 421 422 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); 423 424 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) 425 { 426 SWR_ASSERT(pDC != nullptr); 427 SWR_ASSERT(pDC->pState != nullptr); 428 429 return pDC->pState->state; 430 } 431 432 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) 433 { 434 SWR_ASSERT(pDC != nullptr); 435 SWR_ASSERT(pDC->pState != nullptr); 436 437 return pDC->pState->pPrivateState; 438 } 439 440 class HotTileMgr; 441 442 struct SWR_CONTEXT 443 { 444 // Draw Context Ring 445 // Each draw needs its own state in order to support mulitple draws in flight across multiple threads. 446 // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number 447 // of draws that can be in flight at any given time. 448 // 449 // Description: 450 // 1. State - When an application first sets state we'll request a new draw context to use. 451 // a. If there are no available draw contexts then we'll have to wait until one becomes free. 452 // b. If one is available then set pCurDrawContext to point to it and mark it in use. 453 // c. All state calls set state on pCurDrawContext. 454 // 2. Draw - Creates submits a work item that is associated with current draw context. 455 // a. Set pPrevDrawContext = pCurDrawContext 456 // b. Set pCurDrawContext to NULL. 457 // 3. State - When an applications sets state after draw 458 // a. Same as step 1. 459 // b. State is copied from prev draw context to current. 460 RingBuffer<DRAW_CONTEXT> dcRing; 461 462 DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. 463 DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. 464 465 MacroTileMgr* pMacroTileManagerArray; 466 DispatchQueue* pDispatchQueueArray; 467 468 // Draw State Ring 469 // When draw are very large (lots of primitives) then the API thread will break these up. 470 // These split draws all have identical state. So instead of storing the state directly 471 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs 472 // to reference a single entry in the DS ring. 473 RingBuffer<DRAW_STATE> dsRing; 474 475 uint32_t curStateId; // Current index to the next available entry in the DS ring. 476 477 uint32_t NumWorkerThreads; 478 uint32_t NumFEThreads; 479 uint32_t NumBEThreads; 480 481 THREAD_POOL threadPool; // Thread pool associated with this context 482 SWR_THREADING_INFO threadInfo; 483 SWR_API_THREADING_INFO apiThreadInfo; 484 485 uint32_t MAX_DRAWS_IN_FLIGHT; 486 487 std::condition_variable FifosNotEmpty; 488 std::mutex WaitLock; 489 490 uint32_t privateStateSize; 491 492 HotTileMgr *pHotTileMgr; 493 494 // Callback functions, passed in at create context time 495 PFN_LOAD_TILE pfnLoadTile; 496 PFN_STORE_TILE pfnStoreTile; 497 PFN_CLEAR_TILE pfnClearTile; 498 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; 499 PFN_UPDATE_STATS pfnUpdateStats; 500 PFN_UPDATE_STATS_FE pfnUpdateStatsFE; 501 502 503 // Global Stats 504 SWR_STATS* pStats; 505 506 // Scratch space for workers. 507 uint8_t** ppScratch; 508 509 volatile OSALIGNLINE(uint32_t) drawsOutstandingFE; 510 511 OSALIGNLINE(CachingAllocator) cachingArenaAllocator; 512 uint32_t frameCount; 513 514 uint32_t lastFrameChecked; 515 uint64_t lastDrawChecked; 516 TileSet singleThreadLockedTiles; 517 518 // ArchRast thread contexts. 519 HANDLE* pArContext; 520 }; 521 522 #define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; } 523 #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; } 524 525 // ArchRast instrumentation framework 526 #define AR_WORKER_CTX pContext->pArContext[workerId] 527 #define AR_API_CTX pContext->pArContext[pContext->NumWorkerThreads] 528 529 #ifdef KNOB_ENABLE_AR 530 #define _AR_BEGIN(ctx, type, id) ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id)) 531 #define _AR_END(ctx, type, count) ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count)) 532 #define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event) 533 #define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id) 534 #else 535 #ifdef KNOB_ENABLE_RDTSC 536 #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type) 537 #define _AR_END(ctx, type, id) RDTSC_STOP(type, id, 0) 538 #else 539 #define _AR_BEGIN(ctx, type, id) (void)ctx 540 #define _AR_END(ctx, type, id) 541 #endif 542 #define _AR_EVENT(ctx, event) 543 #define _AR_FLUSH(ctx, id) 544 #endif 545 546 // Use these macros for api thread. 547 #define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id) 548 #define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count) 549 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event) 550 551 // Use these macros for worker threads. 552 #define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id) 553 #define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count) 554 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event) 555 #define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id) 556