Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file context.h
     24 *
     25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
     26 *        The SWR_CONTEXT is our global context and contains the DC ring,
     27 *        thread state, etc.
     28 *
     29 *        The DRAW_CONTEXT contains all state associated with a draw operation.
     30 *
     31 ******************************************************************************/
     32 #pragma once
     33 
     34 #include <condition_variable>
     35 #include <algorithm>
     36 
     37 #include "core/api.h"
     38 #include "core/utils.h"
     39 #include "core/arena.h"
     40 #include "core/fifo.hpp"
     41 #include "core/knobs.h"
     42 #include "common/intrin.h"
     43 #include "core/threads.h"
     44 #include "ringbuffer.h"
     45 #include "archrast/archrast.h"
     46 
     47 // x.8 fixed point precision values
     48 #define FIXED_POINT_SHIFT 8
     49 #define FIXED_POINT_SCALE 256
     50 
     51 // x.16 fixed point precision values
     52 #define FIXED_POINT16_SHIFT 16
     53 #define FIXED_POINT16_SCALE 65536
     54 
     55 struct SWR_CONTEXT;
     56 struct DRAW_CONTEXT;
     57 
     58 struct TRI_FLAGS
     59 {
     60     uint32_t frontFacing : 1;
     61     uint32_t yMajor : 1;
     62     uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
     63     uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
     64     float pointSize;
     65     uint32_t renderTargetArrayIndex;
     66     uint32_t viewportIndex;
     67 };
     68 
     69 //////////////////////////////////////////////////////////////////////////
     70 /// SWR_TRIANGLE_DESC
     71 /////////////////////////////////////////////////////////////////////////
     72 struct SWR_TRIANGLE_DESC
     73 {
     74     float I[3];
     75     float J[3];
     76     float Z[3];
     77     float OneOverW[3];
     78     float recipDet;
     79 
     80     float *pRecipW;
     81     float *pAttribs;
     82     float *pPerspAttribs;
     83     float *pSamplePos;
     84     float *pUserClipBuffer;
     85 
     86     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
     87     uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
     88     uint64_t anyCoveredSamples;
     89 
     90     TRI_FLAGS triFlags;
     91 };
     92 
     93 struct TRIANGLE_WORK_DESC
     94 {
     95     float *pTriBuffer;
     96     float *pAttribs;
     97     float *pUserClipBuffer;
     98     uint32_t numAttribs;
     99     TRI_FLAGS triFlags;
    100 };
    101 
    102 struct CLEAR_DESC
    103 {
    104     SWR_RECT rect;
    105     uint32_t attachmentMask;
    106     uint32_t renderTargetArrayIndex;
    107     float clearRTColor[4];  // RGBA_32F
    108     float clearDepth;   // [0..1]
    109     uint8_t clearStencil;
    110 };
    111 
    112 struct DISCARD_INVALIDATE_TILES_DESC
    113 {
    114     uint32_t attachmentMask;
    115     SWR_RECT rect;
    116     SWR_TILE_STATE newTileState;
    117     bool createNewTiles;
    118     bool fullTilesOnly;
    119 };
    120 
    121 struct SYNC_DESC
    122 {
    123     PFN_CALLBACK_FUNC pfnCallbackFunc;
    124     uint64_t userData;
    125     uint64_t userData2;
    126     uint64_t userData3;
    127 };
    128 
    129 struct STORE_TILES_DESC
    130 {
    131     uint32_t attachmentMask;
    132     SWR_TILE_STATE postStoreTileState;
    133     SWR_RECT rect;
    134 };
    135 
    136 struct COMPUTE_DESC
    137 {
    138     uint32_t threadGroupCountX;
    139     uint32_t threadGroupCountY;
    140     uint32_t threadGroupCountZ;
    141 };
    142 
    143 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
    144 
    145 enum WORK_TYPE
    146 {
    147     SYNC,
    148     DRAW,
    149     CLEAR,
    150     DISCARDINVALIDATETILES,
    151     STORETILES,
    152     SHUTDOWN,
    153 };
    154 
    155 OSALIGNSIMD(struct) BE_WORK
    156 {
    157     WORK_TYPE type;
    158     PFN_WORK_FUNC pfnWork;
    159     union
    160     {
    161         SYNC_DESC sync;
    162         TRIANGLE_WORK_DESC tri;
    163         CLEAR_DESC clear;
    164         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
    165         STORE_TILES_DESC storeTiles;
    166     } desc;
    167 };
    168 
    169 struct DRAW_WORK
    170 {
    171     DRAW_CONTEXT*   pDC;
    172     union
    173     {
    174         uint32_t   numIndices;      // DrawIndexed: Number of indices for draw.
    175         uint32_t   numVerts;        // Draw: Number of verts (triangles, lines, etc)
    176     };
    177     union
    178     {
    179         const int32_t* pIB;        // DrawIndexed: App supplied indices
    180         uint32_t   startVertex;    // Draw: Starting vertex in VB to render from.
    181     };
    182     int32_t    baseVertex;
    183     uint32_t   numInstances;        // Number of instances
    184     uint32_t   startInstance;       // Instance offset
    185     uint32_t   startPrimID;         // starting primitiveID for this draw batch
    186     uint32_t   startVertexID;       // starting VertexID for this draw batch (only needed for non-indexed draws)
    187     SWR_FORMAT type;                // index buffer type
    188 };
    189 
    190 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
    191 struct FE_WORK
    192 {
    193     WORK_TYPE type;
    194     PFN_FE_WORK_FUNC pfnWork;
    195     union
    196     {
    197         SYNC_DESC sync;
    198         DRAW_WORK draw;
    199         CLEAR_DESC clear;
    200         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
    201         STORE_TILES_DESC storeTiles;
    202     } desc;
    203 };
    204 
    205 struct GUARDBANDS
    206 {
    207     float left[KNOB_NUM_VIEWPORTS_SCISSORS];
    208     float right[KNOB_NUM_VIEWPORTS_SCISSORS];
    209     float top[KNOB_NUM_VIEWPORTS_SCISSORS];
    210     float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
    211 };
    212 
    213 struct PA_STATE;
    214 
    215 // function signature for pipeline stages that execute after primitive assembly
    216 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
    217     uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
    218 
    219 #if ENABLE_AVX512_SIMD16
    220 // function signature for pipeline stages that execute after primitive assembly
    221 typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
    222     uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
    223 
    224 #endif
    225 OSALIGNLINE(struct) API_STATE
    226 {
    227     // Vertex Buffers
    228     SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
    229 
    230     // Index Buffer
    231     SWR_INDEX_BUFFER_STATE  indexBuffer;
    232 
    233     // FS - Fetch Shader State
    234     PFN_FETCH_FUNC          pfnFetchFunc;
    235 
    236     // VS - Vertex Shader State
    237     PFN_VERTEX_FUNC         pfnVertexFunc;
    238 
    239     // GS - Geometry Shader State
    240     PFN_GS_FUNC             pfnGsFunc;
    241     SWR_GS_STATE            gsState;
    242 
    243     // CS - Compute Shader
    244     PFN_CS_FUNC             pfnCsFunc;
    245     uint32_t                totalThreadsInGroup;
    246     uint32_t                totalSpillFillSize;
    247     uint32_t                scratchSpaceSize;
    248     uint32_t                scratchSpaceNumInstances;
    249 
    250     // FE - Frontend State
    251     SWR_FRONTEND_STATE      frontendState;
    252 
    253     // SOS - Streamout Shader State
    254     PFN_SO_FUNC             pfnSoFunc[MAX_SO_STREAMS];
    255 
    256     // Streamout state
    257     SWR_STREAMOUT_STATE     soState;
    258     mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
    259 
    260     // Tessellation State
    261     PFN_HS_FUNC             pfnHsFunc;
    262     PFN_DS_FUNC             pfnDsFunc;
    263     SWR_TS_STATE            tsState;
    264 
    265     // Number of attributes used by the frontend (vs, so, gs)
    266     uint32_t                feNumAttributes;
    267 
    268     PRIMITIVE_TOPOLOGY      topology;
    269     bool                    forceFront;
    270 
    271     // RS - Rasterizer State
    272     SWR_RASTSTATE           rastState;
    273     // floating point multisample offsets
    274     float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
    275 
    276     GUARDBANDS               gbState;
    277 
    278     SWR_VIEWPORT            vp[KNOB_NUM_VIEWPORTS_SCISSORS];
    279     SWR_VIEWPORT_MATRICES   vpMatrices;
    280 
    281     SWR_RECT                scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
    282     SWR_RECT                scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
    283     bool                    scissorsTileAligned;
    284 
    285     // Backend state
    286     SWR_BACKEND_STATE       backendState;
    287 
    288     SWR_DEPTH_BOUNDS_STATE  depthBoundsState;
    289 
    290     // PS - Pixel shader state
    291     SWR_PS_STATE            psState;
    292 
    293     SWR_DEPTH_STENCIL_STATE depthStencilState;
    294 
    295     // OM - Output Merger State
    296     SWR_BLEND_STATE         blendState;
    297     PFN_BLEND_JIT_FUNC      pfnBlendFunc[SWR_NUM_RENDERTARGETS];
    298 
    299     struct
    300     {
    301         uint32_t enableStatsFE : 1;             // Enable frontend pipeline stats
    302         uint32_t enableStatsBE : 1;             // Enable backend pipeline stats
    303         uint32_t colorHottileEnable : 8;        // Bitmask of enabled color hottiles
    304         uint32_t depthHottileEnable: 1;         // Enable depth buffer hottile
    305         uint32_t stencilHottileEnable : 1;      // Enable stencil buffer hottile
    306     };
    307 
    308     PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
    309 };
    310 
    311 class MacroTileMgr;
    312 class DispatchQueue;
    313 
    314 struct RenderOutputBuffers
    315 {
    316     uint8_t* pColor[SWR_NUM_RENDERTARGETS];
    317     uint8_t* pDepth;
    318     uint8_t* pStencil;
    319 };
    320 
    321 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
    322 struct BarycentricCoeffs
    323 {
    324     simdscalar vIa;
    325     simdscalar vIb;
    326     simdscalar vIc;
    327 
    328     simdscalar vJa;
    329     simdscalar vJb;
    330     simdscalar vJc;
    331 
    332     simdscalar vZa;
    333     simdscalar vZb;
    334     simdscalar vZc;
    335 
    336     simdscalar vRecipDet;
    337 
    338     simdscalar vAOneOverW;
    339     simdscalar vBOneOverW;
    340     simdscalar vCOneOverW;
    341 };
    342 
    343 // pipeline function pointer types
    344 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
    345 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
    346                                  const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &);
    347 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
    348 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
    349 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
    350                                               simdscalar const &, simdscalar const &);
    351 
    352 struct BACKEND_FUNCS
    353 {
    354     PFN_BACKEND_FUNC pfnBackend;
    355 };
    356 
    357 // Draw State
    358 struct DRAW_STATE
    359 {
    360     API_STATE state;
    361 
    362     void* pPrivateState;  // Its required the driver sets this up for each draw.
    363 
    364     // pipeline function pointers, filled in by API thread when setting up the draw
    365     BACKEND_FUNCS backendFuncs;
    366     PFN_PROCESS_PRIMS pfnProcessPrims;
    367 #if USE_SIMD16_FRONTEND
    368     PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
    369 #endif
    370 
    371     CachingArena* pArena;     // This should only be used by API thread.
    372 };
    373 
    374 struct DRAW_DYNAMIC_STATE
    375 {
    376     void Reset(uint32_t numThreads)
    377     {
    378         SWR_STATS* pSavePtr = pStats;
    379         memset(this, 0, sizeof(*this));
    380         pStats = pSavePtr;
    381         memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
    382     }
    383     ///@todo Currently assumes only a single FE can do stream output for a draw.
    384     uint32_t SoWriteOffset[4];
    385     bool     SoWriteOffsetDirty[4];
    386 
    387     SWR_STATS_FE statsFE;   // Only one FE thread per DC.
    388     SWR_STATS*   pStats;
    389 };
    390 
    391 // Draw Context
    392 //    The api thread sets up a draw context that exists for the life of the draw.
    393 //    This draw context maintains all of the state needed for the draw operation.
    394 struct DRAW_CONTEXT
    395 {
    396     SWR_CONTEXT*    pContext;
    397     union
    398     {
    399         MacroTileMgr*   pTileMgr;
    400         DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
    401     };
    402     DRAW_STATE*     pState;             // Read-only state. Core should not update this outside of API thread.
    403     DRAW_DYNAMIC_STATE dynState;
    404 
    405     CachingArena*   pArena;
    406 
    407     uint32_t        drawId;
    408     bool            dependentFE;    // Frontend work is dependent on all previous FE
    409     bool            dependent;      // Backend work is dependent on all previous BE
    410     bool            isCompute;      // Is this DC a compute context?
    411     bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
    412 
    413     FE_WORK         FeWork;
    414 
    415     volatile OSALIGNLINE(bool)       doneFE;         // Is FE work done for this draw?
    416     volatile OSALIGNLINE(uint32_t)   FeLock;
    417     volatile OSALIGNLINE(uint32_t)   threadsDone;
    418 
    419     SYNC_DESC       retireCallback; // Call this func when this DC is retired.
    420 };
    421 
    422 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
    423 
    424 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
    425 {
    426     SWR_ASSERT(pDC != nullptr);
    427     SWR_ASSERT(pDC->pState != nullptr);
    428 
    429     return pDC->pState->state;
    430 }
    431 
    432 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
    433 {
    434     SWR_ASSERT(pDC != nullptr);
    435     SWR_ASSERT(pDC->pState != nullptr);
    436 
    437     return pDC->pState->pPrivateState;
    438 }
    439 
    440 class HotTileMgr;
    441 
    442 struct SWR_CONTEXT
    443 {
    444     // Draw Context Ring
    445     //  Each draw needs its own state in order to support mulitple draws in flight across multiple threads.
    446     //  We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
    447     //  of draws that can be in flight at any given time.
    448     //
    449     //  Description:
    450     //  1. State - When an application first sets state we'll request a new draw context to use.
    451     //     a. If there are no available draw contexts then we'll have to wait until one becomes free.
    452     //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
    453     //     c. All state calls set state on pCurDrawContext.
    454     //  2. Draw - Creates submits a work item that is associated with current draw context.
    455     //     a. Set pPrevDrawContext = pCurDrawContext
    456     //     b. Set pCurDrawContext to NULL.
    457     //  3. State - When an applications sets state after draw
    458     //     a. Same as step 1.
    459     //     b. State is copied from prev draw context to current.
    460     RingBuffer<DRAW_CONTEXT> dcRing;
    461 
    462     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
    463     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
    464 
    465     MacroTileMgr* pMacroTileManagerArray;
    466     DispatchQueue* pDispatchQueueArray;
    467 
    468     // Draw State Ring
    469     //  When draw are very large (lots of primitives) then the API thread will break these up.
    470     //  These split draws all have identical state. So instead of storing the state directly
    471     //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
    472     //  to reference a single entry in the DS ring.
    473     RingBuffer<DRAW_STATE> dsRing;
    474 
    475     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
    476 
    477     uint32_t NumWorkerThreads;
    478     uint32_t NumFEThreads;
    479     uint32_t NumBEThreads;
    480 
    481     THREAD_POOL threadPool; // Thread pool associated with this context
    482     SWR_THREADING_INFO threadInfo;
    483     SWR_API_THREADING_INFO apiThreadInfo;
    484 
    485     uint32_t MAX_DRAWS_IN_FLIGHT;
    486 
    487     std::condition_variable FifosNotEmpty;
    488     std::mutex WaitLock;
    489 
    490     uint32_t privateStateSize;
    491 
    492     HotTileMgr *pHotTileMgr;
    493 
    494     // Callback functions, passed in at create context time
    495     PFN_LOAD_TILE               pfnLoadTile;
    496     PFN_STORE_TILE              pfnStoreTile;
    497     PFN_CLEAR_TILE              pfnClearTile;
    498     PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
    499     PFN_UPDATE_STATS            pfnUpdateStats;
    500     PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
    501 
    502 
    503     // Global Stats
    504     SWR_STATS* pStats;
    505 
    506     // Scratch space for workers.
    507     uint8_t** ppScratch;
    508 
    509     volatile OSALIGNLINE(uint32_t)  drawsOutstandingFE;
    510 
    511     OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
    512     uint32_t frameCount;
    513 
    514     uint32_t lastFrameChecked;
    515     uint64_t lastDrawChecked;
    516     TileSet singleThreadLockedTiles;
    517 
    518     // ArchRast thread contexts.
    519     HANDLE* pArContext;
    520 };
    521 
    522 #define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; }
    523 #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; }
    524 
    525 // ArchRast instrumentation framework
    526 #define AR_WORKER_CTX  pContext->pArContext[workerId]
    527 #define AR_API_CTX     pContext->pArContext[pContext->NumWorkerThreads]
    528 
    529 #ifdef KNOB_ENABLE_AR
    530     #define _AR_BEGIN(ctx, type, id)    ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id))
    531     #define _AR_END(ctx, type, count)   ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count))
    532     #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
    533     #define _AR_FLUSH(ctx, id)          ArchRast::FlushDraw(ctx, id)
    534 #else
    535     #ifdef KNOB_ENABLE_RDTSC
    536         #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type)
    537         #define _AR_END(ctx, type, id)   RDTSC_STOP(type, id, 0)
    538     #else
    539         #define _AR_BEGIN(ctx, type, id) (void)ctx
    540         #define _AR_END(ctx, type, id)
    541     #endif
    542     #define _AR_EVENT(ctx, event)
    543     #define _AR_FLUSH(ctx, id)
    544 #endif
    545 
    546 // Use these macros for api thread.
    547 #define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id)
    548 #define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count)
    549 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
    550 
    551 // Use these macros for worker threads.
    552 #define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id)
    553 #define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count)
    554 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
    555 #define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)
    556