Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file context.h
     24 *
     25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
     26 *        The SWR_CONTEXT is our global context and contains the DC ring,
     27 *        thread state, etc.
     28 *
     29 *        The DRAW_CONTEXT contains all state associated with a draw operation.
     30 *
     31 ******************************************************************************/
     32 #pragma once
     33 
     34 #include <condition_variable>
     35 #include <algorithm>
     36 
     37 #include "core/api.h"
     38 #include "core/utils.h"
     39 #include "core/arena.h"
     40 #include "core/fifo.hpp"
     41 #include "core/knobs.h"
     42 #include "common/simdintrin.h"
     43 #include "core/threads.h"
     44 #include "ringbuffer.h"
     45 #include "archrast/archrast.h"
     46 
     47 // x.8 fixed point precision values
     48 #define FIXED_POINT_SHIFT 8
     49 #define FIXED_POINT_SCALE 256
     50 
     51 // x.16 fixed point precision values
     52 #define FIXED_POINT16_SHIFT 16
     53 #define FIXED_POINT16_SCALE 65536
     54 
     55 struct SWR_CONTEXT;
     56 struct DRAW_CONTEXT;
     57 
     58 struct TRI_FLAGS
     59 {
     60     uint32_t frontFacing : 1;
     61     uint32_t yMajor : 1;
     62     uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
     63     uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
     64     float pointSize;
     65     uint32_t primID;
     66     uint32_t renderTargetArrayIndex;
     67     uint32_t viewportIndex;
     68 };
     69 
     70 //////////////////////////////////////////////////////////////////////////
     71 /// SWR_TRIANGLE_DESC
     72 /////////////////////////////////////////////////////////////////////////
     73 struct SWR_TRIANGLE_DESC
     74 {
     75     float I[3];
     76     float J[3];
     77     float Z[3];
     78     float OneOverW[3];
     79     float recipDet;
     80 
     81     float *pRecipW;
     82     float *pAttribs;
     83     float *pPerspAttribs;
     84     float *pSamplePos;
     85     float *pUserClipBuffer;
     86 
     87     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
     88     uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
     89     uint64_t anyCoveredSamples;
     90 
     91     TRI_FLAGS triFlags;
     92 };
     93 
     94 struct TRIANGLE_WORK_DESC
     95 {
     96     float *pTriBuffer;
     97     float *pAttribs;
     98     float *pUserClipBuffer;
     99     uint32_t numAttribs;
    100     TRI_FLAGS triFlags;
    101 };
    102 
    103 struct CLEAR_DESC
    104 {
    105     SWR_RECT rect;
    106     uint32_t attachmentMask;
    107     uint32_t renderTargetArrayIndex;
    108     float clearRTColor[4];  // RGBA_32F
    109     float clearDepth;   // [0..1]
    110     uint8_t clearStencil;
    111 };
    112 
    113 struct DISCARD_INVALIDATE_TILES_DESC
    114 {
    115     uint32_t attachmentMask;
    116     SWR_RECT rect;
    117     SWR_TILE_STATE newTileState;
    118     bool createNewTiles;
    119     bool fullTilesOnly;
    120 };
    121 
    122 struct SYNC_DESC
    123 {
    124     PFN_CALLBACK_FUNC pfnCallbackFunc;
    125     uint64_t userData;
    126     uint64_t userData2;
    127     uint64_t userData3;
    128 };
    129 
    130 struct STORE_TILES_DESC
    131 {
    132     uint32_t attachmentMask;
    133     SWR_TILE_STATE postStoreTileState;
    134     SWR_RECT rect;
    135 };
    136 
    137 struct COMPUTE_DESC
    138 {
    139     uint32_t threadGroupCountX;
    140     uint32_t threadGroupCountY;
    141     uint32_t threadGroupCountZ;
    142 };
    143 
    144 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
    145 
    146 enum WORK_TYPE
    147 {
    148     SYNC,
    149     DRAW,
    150     CLEAR,
    151     DISCARDINVALIDATETILES,
    152     STORETILES,
    153     SHUTDOWN,
    154 };
    155 
    156 OSALIGNSIMD(struct) BE_WORK
    157 {
    158     WORK_TYPE type;
    159     PFN_WORK_FUNC pfnWork;
    160     union
    161     {
    162         SYNC_DESC sync;
    163         TRIANGLE_WORK_DESC tri;
    164         CLEAR_DESC clear;
    165         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
    166         STORE_TILES_DESC storeTiles;
    167     } desc;
    168 };
    169 
    170 struct DRAW_WORK
    171 {
    172     DRAW_CONTEXT*   pDC;
    173     union
    174     {
    175         uint32_t   numIndices;      // DrawIndexed: Number of indices for draw.
    176         uint32_t   numVerts;        // Draw: Number of verts (triangles, lines, etc)
    177     };
    178     union
    179     {
    180         const int32_t* pIB;        // DrawIndexed: App supplied indices
    181         uint32_t   startVertex;    // Draw: Starting vertex in VB to render from.
    182     };
    183     int32_t    baseVertex;
    184     uint32_t   numInstances;        // Number of instances
    185     uint32_t   startInstance;       // Instance offset
    186     uint32_t   startPrimID;         // starting primitiveID for this draw batch
    187     uint32_t   startVertexID;       // starting VertexID for this draw batch (only needed for non-indexed draws)
    188     SWR_FORMAT type;                // index buffer type
    189 };
    190 
    191 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
    192 struct FE_WORK
    193 {
    194     WORK_TYPE type;
    195     PFN_FE_WORK_FUNC pfnWork;
    196     union
    197     {
    198         SYNC_DESC sync;
    199         DRAW_WORK draw;
    200         CLEAR_DESC clear;
    201         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
    202         STORE_TILES_DESC storeTiles;
    203     } desc;
    204 };
    205 
    206 struct GUARDBANDS
    207 {
    208     float left[KNOB_NUM_VIEWPORTS_SCISSORS];
    209     float right[KNOB_NUM_VIEWPORTS_SCISSORS];
    210     float top[KNOB_NUM_VIEWPORTS_SCISSORS];
    211     float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
    212 };
    213 
    214 struct PA_STATE;
    215 
    216 // function signature for pipeline stages that execute after primitive assembly
    217 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
    218     uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
    219 
    220 OSALIGNLINE(struct) API_STATE
    221 {
    222     // Vertex Buffers
    223     SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
    224 
    225     // Index Buffer
    226     SWR_INDEX_BUFFER_STATE  indexBuffer;
    227 
    228     // FS - Fetch Shader State
    229     PFN_FETCH_FUNC          pfnFetchFunc;
    230 
    231     // VS - Vertex Shader State
    232     PFN_VERTEX_FUNC         pfnVertexFunc;
    233 
    234     // GS - Geometry Shader State
    235     PFN_GS_FUNC             pfnGsFunc;
    236     SWR_GS_STATE            gsState;
    237 
    238     // CS - Compute Shader
    239     PFN_CS_FUNC             pfnCsFunc;
    240     uint32_t                totalThreadsInGroup;
    241     uint32_t                totalSpillFillSize;
    242 
    243     // FE - Frontend State
    244     SWR_FRONTEND_STATE      frontendState;
    245 
    246     // SOS - Streamout Shader State
    247     PFN_SO_FUNC             pfnSoFunc[MAX_SO_STREAMS];
    248 
    249     // Streamout state
    250     SWR_STREAMOUT_STATE     soState;
    251     mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
    252 
    253     // Tessellation State
    254     PFN_HS_FUNC             pfnHsFunc;
    255     PFN_DS_FUNC             pfnDsFunc;
    256     SWR_TS_STATE            tsState;
    257 
    258     // Number of attributes used by the frontend (vs, so, gs)
    259     uint32_t                feNumAttributes;
    260 
    261     PRIMITIVE_TOPOLOGY      topology;
    262     bool                    forceFront;
    263 
    264     // RS - Rasterizer State
    265     SWR_RASTSTATE           rastState;
    266     // floating point multisample offsets
    267     float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
    268 
    269     GUARDBANDS               gbState;
    270 
    271     SWR_VIEWPORT            vp[KNOB_NUM_VIEWPORTS_SCISSORS];
    272     SWR_VIEWPORT_MATRICES   vpMatrices;
    273 
    274     SWR_RECT                scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
    275     SWR_RECT                scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
    276     bool                    scissorsTileAligned;
    277 
    278     // Backend state
    279     SWR_BACKEND_STATE       backendState;
    280 
    281     SWR_DEPTH_BOUNDS_STATE  depthBoundsState;
    282 
    283     // PS - Pixel shader state
    284     SWR_PS_STATE            psState;
    285 
    286     SWR_DEPTH_STENCIL_STATE depthStencilState;
    287 
    288     // OM - Output Merger State
    289     SWR_BLEND_STATE         blendState;
    290     PFN_BLEND_JIT_FUNC      pfnBlendFunc[SWR_NUM_RENDERTARGETS];
    291 
    292     struct
    293     {
    294         uint32_t enableStatsFE : 1;             // Enable frontend pipeline stats
    295         uint32_t enableStatsBE : 1;             // Enable backend pipeline stats
    296         uint32_t colorHottileEnable : 8;        // Bitmask of enabled color hottiles
    297         uint32_t depthHottileEnable: 1;         // Enable depth buffer hottile
    298         uint32_t stencilHottileEnable : 1;      // Enable stencil buffer hottile
    299     };
    300 
    301     PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
    302 };
    303 
    304 class MacroTileMgr;
    305 class DispatchQueue;
    306 
    307 struct RenderOutputBuffers
    308 {
    309     uint8_t* pColor[SWR_NUM_RENDERTARGETS];
    310     uint8_t* pDepth;
    311     uint8_t* pStencil;
    312 };
    313 
    314 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
    315 struct BarycentricCoeffs
    316 {
    317     simdscalar vIa;
    318     simdscalar vIb;
    319     simdscalar vIc;
    320 
    321     simdscalar vJa;
    322     simdscalar vJb;
    323     simdscalar vJc;
    324 
    325     simdscalar vZa;
    326     simdscalar vZb;
    327     simdscalar vZc;
    328 
    329     simdscalar vRecipDet;
    330 
    331     simdscalar vAOneOverW;
    332     simdscalar vBOneOverW;
    333     simdscalar vCOneOverW;
    334 };
    335 
    336 // pipeline function pointer types
    337 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
    338 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
    339                                  const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
    340 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
    341 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
    342 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
    343                                               const simdscalar, const simdscalar);
    344 
    345 struct BACKEND_FUNCS
    346 {
    347     PFN_BACKEND_FUNC pfnBackend;
    348 };
    349 
    350 // Draw State
    351 struct DRAW_STATE
    352 {
    353     API_STATE state;
    354 
    355     void* pPrivateState;  // Its required the driver sets this up for each draw.
    356 
    357     // pipeline function pointers, filled in by API thread when setting up the draw
    358     BACKEND_FUNCS backendFuncs;
    359     PFN_PROCESS_PRIMS pfnProcessPrims;
    360 
    361     CachingArena* pArena;     // This should only be used by API thread.
    362 };
    363 
    364 struct DRAW_DYNAMIC_STATE
    365 {
    366     void Reset(uint32_t numThreads)
    367     {
    368         SWR_STATS* pSavePtr = pStats;
    369         memset(this, 0, sizeof(*this));
    370         pStats = pSavePtr;
    371         memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
    372     }
    373     ///@todo Currently assumes only a single FE can do stream output for a draw.
    374     uint32_t SoWriteOffset[4];
    375     bool     SoWriteOffsetDirty[4];
    376 
    377     SWR_STATS_FE statsFE;   // Only one FE thread per DC.
    378     SWR_STATS*   pStats;
    379 };
    380 
    381 // Draw Context
    382 //    The api thread sets up a draw context that exists for the life of the draw.
    383 //    This draw context maintains all of the state needed for the draw operation.
    384 struct DRAW_CONTEXT
    385 {
    386     SWR_CONTEXT*    pContext;
    387     union
    388     {
    389         MacroTileMgr*   pTileMgr;
    390         DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
    391     };
    392     DRAW_STATE*     pState;             // Read-only state. Core should not update this outside of API thread.
    393     DRAW_DYNAMIC_STATE dynState;
    394 
    395     CachingArena*   pArena;
    396 
    397     uint32_t        drawId;
    398     bool            dependentFE;    // Frontend work is dependent on all previous FE
    399     bool            dependent;      // Backend work is dependent on all previous BE
    400     bool            isCompute;      // Is this DC a compute context?
    401     bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
    402     volatile bool   doneFE;         // Is FE work done for this draw?
    403 
    404     FE_WORK         FeWork;
    405 
    406     volatile OSALIGNLINE(uint32_t)   FeLock;
    407     volatile int32_t    threadsDone;
    408 
    409     SYNC_DESC       retireCallback; // Call this func when this DC is retired.
    410 
    411 
    412 };
    413 
    414 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
    415 
    416 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
    417 {
    418     SWR_ASSERT(pDC != nullptr);
    419     SWR_ASSERT(pDC->pState != nullptr);
    420 
    421     return pDC->pState->state;
    422 }
    423 
    424 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
    425 {
    426     SWR_ASSERT(pDC != nullptr);
    427     SWR_ASSERT(pDC->pState != nullptr);
    428 
    429     return pDC->pState->pPrivateState;
    430 }
    431 
    432 class HotTileMgr;
    433 
    434 struct SWR_CONTEXT
    435 {
    436     // Draw Context Ring
    437     //  Each draw needs its own state in order to support mulitple draws in flight across multiple threads.
    438     //  We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
    439     //  of draws that can be in flight at any given time.
    440     //
    441     //  Description:
    442     //  1. State - When an application first sets state we'll request a new draw context to use.
    443     //     a. If there are no available draw contexts then we'll have to wait until one becomes free.
    444     //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
    445     //     c. All state calls set state on pCurDrawContext.
    446     //  2. Draw - Creates submits a work item that is associated with current draw context.
    447     //     a. Set pPrevDrawContext = pCurDrawContext
    448     //     b. Set pCurDrawContext to NULL.
    449     //  3. State - When an applications sets state after draw
    450     //     a. Same as step 1.
    451     //     b. State is copied from prev draw context to current.
    452     RingBuffer<DRAW_CONTEXT> dcRing;
    453 
    454     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
    455     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
    456 
    457     MacroTileMgr* pMacroTileManagerArray;
    458     DispatchQueue* pDispatchQueueArray;
    459 
    460     // Draw State Ring
    461     //  When draw are very large (lots of primitives) then the API thread will break these up.
    462     //  These split draws all have identical state. So instead of storing the state directly
    463     //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
    464     //  to reference a single entry in the DS ring.
    465     RingBuffer<DRAW_STATE> dsRing;
    466 
    467     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
    468 
    469     uint32_t NumWorkerThreads;
    470     uint32_t NumFEThreads;
    471     uint32_t NumBEThreads;
    472 
    473     THREAD_POOL threadPool; // Thread pool associated with this context
    474     SWR_THREADING_INFO threadInfo;
    475 
    476     std::condition_variable FifosNotEmpty;
    477     std::mutex WaitLock;
    478 
    479     uint32_t privateStateSize;
    480 
    481     HotTileMgr *pHotTileMgr;
    482 
    483     // Callback functions, passed in at create context time
    484     PFN_LOAD_TILE               pfnLoadTile;
    485     PFN_STORE_TILE              pfnStoreTile;
    486     PFN_CLEAR_TILE              pfnClearTile;
    487     PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
    488     PFN_UPDATE_STATS            pfnUpdateStats;
    489     PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
    490 
    491 
    492     // Global Stats
    493     SWR_STATS* pStats;
    494 
    495     // Scratch space for workers.
    496     uint8_t** ppScratch;
    497 
    498     volatile int32_t  drawsOutstandingFE;
    499 
    500     CachingAllocator cachingArenaAllocator;
    501     uint32_t frameCount;
    502 
    503     uint32_t lastFrameChecked;
    504     uint64_t lastDrawChecked;
    505     TileSet singleThreadLockedTiles;
    506 
    507     // ArchRast thread contexts.
    508     HANDLE* pArContext;
    509 };
    510 
    511 #define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; }
    512 #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; }
    513 
    514 // ArchRast instrumentation framework
    515 #define AR_WORKER_CTX  pContext->pArContext[workerId]
    516 #define AR_API_CTX     pContext->pArContext[pContext->NumWorkerThreads]
    517 
    518 #ifdef KNOB_ENABLE_AR
    519     #define _AR_BEGIN(ctx, type, id)    ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id))
    520     #define _AR_END(ctx, type, count)   ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count))
    521     #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
    522 #else
    523     #ifdef KNOB_ENABLE_RDTSC
    524         #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type)
    525         #define _AR_END(ctx, type, id)   RDTSC_STOP(type, id, 0)
    526     #else
    527         #define _AR_BEGIN(ctx, type, id) (void)ctx
    528         #define _AR_END(ctx, type, id)
    529     #endif
    530     #define _AR_EVENT(ctx, event)
    531 #endif
    532 
    533 // Use these macros for api thread.
    534 #define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id)
    535 #define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count)
    536 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
    537 
    538 // Use these macros for worker threads.
    539 #define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id)
    540 #define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count)
    541 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
    542