1 /**************************************************************************** 2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file context.h 24 * 25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT 26 * The SWR_CONTEXT is our global context and contains the DC ring, 27 * thread state, etc. 28 * 29 * The DRAW_CONTEXT contains all state associated with a draw operation. 30 * 31 ******************************************************************************/ 32 #pragma once 33 34 #include <condition_variable> 35 #include <algorithm> 36 37 #include "core/api.h" 38 #include "core/utils.h" 39 #include "core/arena.h" 40 #include "core/fifo.hpp" 41 #include "core/knobs.h" 42 #include "common/simdintrin.h" 43 #include "core/threads.h" 44 #include "ringbuffer.h" 45 #include "archrast/archrast.h" 46 47 // x.8 fixed point precision values 48 #define FIXED_POINT_SHIFT 8 49 #define FIXED_POINT_SCALE 256 50 51 // x.16 fixed point precision values 52 #define FIXED_POINT16_SHIFT 16 53 #define FIXED_POINT16_SCALE 65536 54 55 struct SWR_CONTEXT; 56 struct DRAW_CONTEXT; 57 58 struct TRI_FLAGS 59 { 60 uint32_t frontFacing : 1; 61 uint32_t yMajor : 1; 62 uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); 63 uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); 64 float pointSize; 65 uint32_t primID; 66 uint32_t renderTargetArrayIndex; 67 uint32_t viewportIndex; 68 }; 69 70 ////////////////////////////////////////////////////////////////////////// 71 /// SWR_TRIANGLE_DESC 72 ///////////////////////////////////////////////////////////////////////// 73 struct SWR_TRIANGLE_DESC 74 { 75 float I[3]; 76 float J[3]; 77 float Z[3]; 78 float OneOverW[3]; 79 float recipDet; 80 81 float *pRecipW; 82 float *pAttribs; 83 float *pPerspAttribs; 84 float *pSamplePos; 85 float *pUserClipBuffer; 86 87 uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; 88 uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered 89 uint64_t anyCoveredSamples; 90 91 TRI_FLAGS triFlags; 92 }; 93 94 struct TRIANGLE_WORK_DESC 95 { 96 float *pTriBuffer; 97 float *pAttribs; 98 float *pUserClipBuffer; 99 uint32_t numAttribs; 100 TRI_FLAGS triFlags; 101 }; 102 103 struct CLEAR_DESC 104 { 105 SWR_RECT rect; 106 uint32_t attachmentMask; 107 uint32_t renderTargetArrayIndex; 108 float clearRTColor[4]; // RGBA_32F 109 float clearDepth; // [0..1] 110 uint8_t clearStencil; 111 }; 112 113 struct DISCARD_INVALIDATE_TILES_DESC 114 { 115 uint32_t attachmentMask; 116 SWR_RECT rect; 117 SWR_TILE_STATE newTileState; 118 bool createNewTiles; 119 bool fullTilesOnly; 120 }; 121 122 struct SYNC_DESC 123 { 124 PFN_CALLBACK_FUNC pfnCallbackFunc; 125 uint64_t userData; 126 uint64_t userData2; 127 uint64_t userData3; 128 }; 129 130 struct STORE_TILES_DESC 131 { 132 uint32_t attachmentMask; 133 SWR_TILE_STATE postStoreTileState; 134 SWR_RECT rect; 135 }; 136 137 struct COMPUTE_DESC 138 { 139 uint32_t threadGroupCountX; 140 uint32_t threadGroupCountY; 141 uint32_t threadGroupCountZ; 142 }; 143 144 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); 145 146 enum WORK_TYPE 147 { 148 SYNC, 149 DRAW, 150 CLEAR, 151 DISCARDINVALIDATETILES, 152 STORETILES, 153 SHUTDOWN, 154 }; 155 156 OSALIGNSIMD(struct) BE_WORK 157 { 158 WORK_TYPE type; 159 PFN_WORK_FUNC pfnWork; 160 union 161 { 162 SYNC_DESC sync; 163 TRIANGLE_WORK_DESC tri; 164 CLEAR_DESC clear; 165 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; 166 STORE_TILES_DESC storeTiles; 167 } desc; 168 }; 169 170 struct DRAW_WORK 171 { 172 DRAW_CONTEXT* pDC; 173 union 174 { 175 uint32_t numIndices; // DrawIndexed: Number of indices for draw. 176 uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) 177 }; 178 union 179 { 180 const int32_t* pIB; // DrawIndexed: App supplied indices 181 uint32_t startVertex; // Draw: Starting vertex in VB to render from. 182 }; 183 int32_t baseVertex; 184 uint32_t numInstances; // Number of instances 185 uint32_t startInstance; // Instance offset 186 uint32_t startPrimID; // starting primitiveID for this draw batch 187 uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) 188 SWR_FORMAT type; // index buffer type 189 }; 190 191 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); 192 struct FE_WORK 193 { 194 WORK_TYPE type; 195 PFN_FE_WORK_FUNC pfnWork; 196 union 197 { 198 SYNC_DESC sync; 199 DRAW_WORK draw; 200 CLEAR_DESC clear; 201 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; 202 STORE_TILES_DESC storeTiles; 203 } desc; 204 }; 205 206 struct GUARDBANDS 207 { 208 float left[KNOB_NUM_VIEWPORTS_SCISSORS]; 209 float right[KNOB_NUM_VIEWPORTS_SCISSORS]; 210 float top[KNOB_NUM_VIEWPORTS_SCISSORS]; 211 float bottom[KNOB_NUM_VIEWPORTS_SCISSORS]; 212 }; 213 214 struct PA_STATE; 215 216 // function signature for pipeline stages that execute after primitive assembly 217 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], 218 uint32_t primMask, simdscalari primID, simdscalari viewportIdx); 219 220 OSALIGNLINE(struct) API_STATE 221 { 222 // Vertex Buffers 223 SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; 224 225 // Index Buffer 226 SWR_INDEX_BUFFER_STATE indexBuffer; 227 228 // FS - Fetch Shader State 229 PFN_FETCH_FUNC pfnFetchFunc; 230 231 // VS - Vertex Shader State 232 PFN_VERTEX_FUNC pfnVertexFunc; 233 234 // GS - Geometry Shader State 235 PFN_GS_FUNC pfnGsFunc; 236 SWR_GS_STATE gsState; 237 238 // CS - Compute Shader 239 PFN_CS_FUNC pfnCsFunc; 240 uint32_t totalThreadsInGroup; 241 uint32_t totalSpillFillSize; 242 243 // FE - Frontend State 244 SWR_FRONTEND_STATE frontendState; 245 246 // SOS - Streamout Shader State 247 PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; 248 249 // Streamout state 250 SWR_STREAMOUT_STATE soState; 251 mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; 252 253 // Tessellation State 254 PFN_HS_FUNC pfnHsFunc; 255 PFN_DS_FUNC pfnDsFunc; 256 SWR_TS_STATE tsState; 257 258 // Number of attributes used by the frontend (vs, so, gs) 259 uint32_t feNumAttributes; 260 261 PRIMITIVE_TOPOLOGY topology; 262 bool forceFront; 263 264 // RS - Rasterizer State 265 SWR_RASTSTATE rastState; 266 // floating point multisample offsets 267 float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2]; 268 269 GUARDBANDS gbState; 270 271 SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; 272 SWR_VIEWPORT_MATRICES vpMatrices; 273 274 SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; 275 SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; 276 bool scissorsTileAligned; 277 278 // Backend state 279 SWR_BACKEND_STATE backendState; 280 281 SWR_DEPTH_BOUNDS_STATE depthBoundsState; 282 283 // PS - Pixel shader state 284 SWR_PS_STATE psState; 285 286 SWR_DEPTH_STENCIL_STATE depthStencilState; 287 288 // OM - Output Merger State 289 SWR_BLEND_STATE blendState; 290 PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; 291 292 struct 293 { 294 uint32_t enableStatsFE : 1; // Enable frontend pipeline stats 295 uint32_t enableStatsBE : 1; // Enable backend pipeline stats 296 uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles 297 uint32_t depthHottileEnable: 1; // Enable depth buffer hottile 298 uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile 299 }; 300 301 PFN_QUANTIZE_DEPTH pfnQuantizeDepth; 302 }; 303 304 class MacroTileMgr; 305 class DispatchQueue; 306 307 struct RenderOutputBuffers 308 { 309 uint8_t* pColor[SWR_NUM_RENDERTARGETS]; 310 uint8_t* pDepth; 311 uint8_t* pStencil; 312 }; 313 314 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords 315 struct BarycentricCoeffs 316 { 317 simdscalar vIa; 318 simdscalar vIb; 319 simdscalar vIc; 320 321 simdscalar vJa; 322 simdscalar vJb; 323 simdscalar vJc; 324 325 simdscalar vZa; 326 simdscalar vZb; 327 simdscalar vZc; 328 329 simdscalar vRecipDet; 330 331 simdscalar vAOneOverW; 332 simdscalar vBOneOverW; 333 simdscalar vCOneOverW; 334 }; 335 336 // pipeline function pointer types 337 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); 338 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*, 339 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar); 340 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &); 341 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); 342 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t, 343 const simdscalar, const simdscalar); 344 345 struct BACKEND_FUNCS 346 { 347 PFN_BACKEND_FUNC pfnBackend; 348 }; 349 350 // Draw State 351 struct DRAW_STATE 352 { 353 API_STATE state; 354 355 void* pPrivateState; // Its required the driver sets this up for each draw. 356 357 // pipeline function pointers, filled in by API thread when setting up the draw 358 BACKEND_FUNCS backendFuncs; 359 PFN_PROCESS_PRIMS pfnProcessPrims; 360 361 CachingArena* pArena; // This should only be used by API thread. 362 }; 363 364 struct DRAW_DYNAMIC_STATE 365 { 366 void Reset(uint32_t numThreads) 367 { 368 SWR_STATS* pSavePtr = pStats; 369 memset(this, 0, sizeof(*this)); 370 pStats = pSavePtr; 371 memset(pStats, 0, sizeof(SWR_STATS) * numThreads); 372 } 373 ///@todo Currently assumes only a single FE can do stream output for a draw. 374 uint32_t SoWriteOffset[4]; 375 bool SoWriteOffsetDirty[4]; 376 377 SWR_STATS_FE statsFE; // Only one FE thread per DC. 378 SWR_STATS* pStats; 379 }; 380 381 // Draw Context 382 // The api thread sets up a draw context that exists for the life of the draw. 383 // This draw context maintains all of the state needed for the draw operation. 384 struct DRAW_CONTEXT 385 { 386 SWR_CONTEXT* pContext; 387 union 388 { 389 MacroTileMgr* pTileMgr; 390 DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) 391 }; 392 DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread. 393 DRAW_DYNAMIC_STATE dynState; 394 395 CachingArena* pArena; 396 397 uint32_t drawId; 398 bool dependentFE; // Frontend work is dependent on all previous FE 399 bool dependent; // Backend work is dependent on all previous BE 400 bool isCompute; // Is this DC a compute context? 401 bool cleanupState; // True if this is the last draw using an entry in the state ring. 402 volatile bool doneFE; // Is FE work done for this draw? 403 404 FE_WORK FeWork; 405 406 volatile OSALIGNLINE(uint32_t) FeLock; 407 volatile int32_t threadsDone; 408 409 SYNC_DESC retireCallback; // Call this func when this DC is retired. 410 411 412 }; 413 414 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); 415 416 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) 417 { 418 SWR_ASSERT(pDC != nullptr); 419 SWR_ASSERT(pDC->pState != nullptr); 420 421 return pDC->pState->state; 422 } 423 424 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) 425 { 426 SWR_ASSERT(pDC != nullptr); 427 SWR_ASSERT(pDC->pState != nullptr); 428 429 return pDC->pState->pPrivateState; 430 } 431 432 class HotTileMgr; 433 434 struct SWR_CONTEXT 435 { 436 // Draw Context Ring 437 // Each draw needs its own state in order to support mulitple draws in flight across multiple threads. 438 // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number 439 // of draws that can be in flight at any given time. 440 // 441 // Description: 442 // 1. State - When an application first sets state we'll request a new draw context to use. 443 // a. If there are no available draw contexts then we'll have to wait until one becomes free. 444 // b. If one is available then set pCurDrawContext to point to it and mark it in use. 445 // c. All state calls set state on pCurDrawContext. 446 // 2. Draw - Creates submits a work item that is associated with current draw context. 447 // a. Set pPrevDrawContext = pCurDrawContext 448 // b. Set pCurDrawContext to NULL. 449 // 3. State - When an applications sets state after draw 450 // a. Same as step 1. 451 // b. State is copied from prev draw context to current. 452 RingBuffer<DRAW_CONTEXT> dcRing; 453 454 DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. 455 DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. 456 457 MacroTileMgr* pMacroTileManagerArray; 458 DispatchQueue* pDispatchQueueArray; 459 460 // Draw State Ring 461 // When draw are very large (lots of primitives) then the API thread will break these up. 462 // These split draws all have identical state. So instead of storing the state directly 463 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs 464 // to reference a single entry in the DS ring. 465 RingBuffer<DRAW_STATE> dsRing; 466 467 uint32_t curStateId; // Current index to the next available entry in the DS ring. 468 469 uint32_t NumWorkerThreads; 470 uint32_t NumFEThreads; 471 uint32_t NumBEThreads; 472 473 THREAD_POOL threadPool; // Thread pool associated with this context 474 SWR_THREADING_INFO threadInfo; 475 476 std::condition_variable FifosNotEmpty; 477 std::mutex WaitLock; 478 479 uint32_t privateStateSize; 480 481 HotTileMgr *pHotTileMgr; 482 483 // Callback functions, passed in at create context time 484 PFN_LOAD_TILE pfnLoadTile; 485 PFN_STORE_TILE pfnStoreTile; 486 PFN_CLEAR_TILE pfnClearTile; 487 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; 488 PFN_UPDATE_STATS pfnUpdateStats; 489 PFN_UPDATE_STATS_FE pfnUpdateStatsFE; 490 491 492 // Global Stats 493 SWR_STATS* pStats; 494 495 // Scratch space for workers. 496 uint8_t** ppScratch; 497 498 volatile int32_t drawsOutstandingFE; 499 500 CachingAllocator cachingArenaAllocator; 501 uint32_t frameCount; 502 503 uint32_t lastFrameChecked; 504 uint64_t lastDrawChecked; 505 TileSet singleThreadLockedTiles; 506 507 // ArchRast thread contexts. 508 HANDLE* pArContext; 509 }; 510 511 #define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; } 512 #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; } 513 514 // ArchRast instrumentation framework 515 #define AR_WORKER_CTX pContext->pArContext[workerId] 516 #define AR_API_CTX pContext->pArContext[pContext->NumWorkerThreads] 517 518 #ifdef KNOB_ENABLE_AR 519 #define _AR_BEGIN(ctx, type, id) ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id)) 520 #define _AR_END(ctx, type, count) ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count)) 521 #define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event) 522 #else 523 #ifdef KNOB_ENABLE_RDTSC 524 #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type) 525 #define _AR_END(ctx, type, id) RDTSC_STOP(type, id, 0) 526 #else 527 #define _AR_BEGIN(ctx, type, id) (void)ctx 528 #define _AR_END(ctx, type, id) 529 #endif 530 #define _AR_EVENT(ctx, event) 531 #endif 532 533 // Use these macros for api thread. 534 #define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id) 535 #define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count) 536 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event) 537 538 // Use these macros for worker threads. 539 #define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id) 540 #define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count) 541 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event) 542