1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file backend.cpp 24 * 25 * @brief Backend handles rasterization, pixel shading and output merger 26 * operations. 27 * 28 ******************************************************************************/ 29 30 #include <smmintrin.h> 31 32 #include "backend.h" 33 #include "backend_impl.h" 34 #include "tilemgr.h" 35 #include "memory/tilingtraits.h" 36 #include "core/multisample.h" 37 #include "backends/gen_BackendPixelRate.hpp" 38 39 #include <algorithm> 40 41 42 ////////////////////////////////////////////////////////////////////////// 43 /// @brief Process compute work. 44 /// @param pDC - pointer to draw context (dispatch). 45 /// @param workerId - The unique worker ID that is assigned to this thread. 46 /// @param threadGroupId - the linear index for the thread group within the dispatch. 47 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace) 48 { 49 SWR_CONTEXT *pContext = pDC->pContext; 50 51 AR_BEGIN(BEDispatch, pDC->drawId); 52 53 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); 54 SWR_ASSERT(pTaskData != nullptr); 55 56 // Ensure spill fill memory has been allocated. 57 size_t spillFillSize = pDC->pState->state.totalSpillFillSize; 58 if (spillFillSize && pSpillFillBuffer == nullptr) 59 { 60 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES); 61 } 62 63 size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances; 64 if (scratchSpaceSize && pScratchSpace == nullptr) 65 { 66 pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES); 67 } 68 69 const API_STATE& state = GetApiState(pDC); 70 71 SWR_CS_CONTEXT csContext{ 0 }; 72 csContext.tileCounter = threadGroupId; 73 csContext.dispatchDims[0] = pTaskData->threadGroupCountX; 74 csContext.dispatchDims[1] = pTaskData->threadGroupCountY; 75 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; 76 csContext.pTGSM = pContext->ppScratch[workerId]; 77 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; 78 csContext.pScratchSpace = (uint8_t*)pScratchSpace; 79 csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize; 80 81 state.pfnCsFunc(GetPrivateState(pDC), &csContext); 82 83 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup); 84 85 AR_END(BEDispatch, 1); 86 } 87 88 ////////////////////////////////////////////////////////////////////////// 89 /// @brief Process shutdown. 90 /// @param pDC - pointer to draw context (dispatch). 91 /// @param workerId - The unique worker ID that is assigned to this thread. 92 /// @param threadGroupId - the linear index for the thread group within the dispatch. 93 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) 94 { 95 // Dummy function 96 } 97 98 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) 99 { 100 uint32_t x, y; 101 MacroTileMgr::getTileIndices(macroTile, x, y); 102 SWR_ASSERT(x == 0 && y == 0); 103 } 104 105 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 106 SWR_RENDERTARGET_ATTACHMENT attachment) 107 { 108 SWR_CONTEXT *pContext = pDC->pContext; 109 110 AR_BEGIN(BEStoreTiles, pDC->drawId); 111 112 SWR_FORMAT srcFormat; 113 switch (attachment) 114 { 115 case SWR_ATTACHMENT_COLOR0: 116 case SWR_ATTACHMENT_COLOR1: 117 case SWR_ATTACHMENT_COLOR2: 118 case SWR_ATTACHMENT_COLOR3: 119 case SWR_ATTACHMENT_COLOR4: 120 case SWR_ATTACHMENT_COLOR5: 121 case SWR_ATTACHMENT_COLOR6: 122 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; 123 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; 124 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; 125 default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; 126 } 127 128 uint32_t x, y; 129 MacroTileMgr::getTileIndices(macroTile, x, y); 130 131 // Only need to store the hottile if it's been rendered to... 132 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false); 133 if (pHotTile) 134 { 135 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. 136 if (pHotTile->state == HOTTILE_CLEAR) 137 { 138 PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat]; 139 SWR_ASSERT(pfnClearTiles != nullptr); 140 141 pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect); 142 } 143 144 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) 145 { 146 int32_t destX = KNOB_MACROTILE_X_DIM * x; 147 int32_t destY = KNOB_MACROTILE_Y_DIM * y; 148 149 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, 150 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); 151 } 152 153 154 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) 155 { 156 if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED)) 157 { 158 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; 159 } 160 } 161 } 162 AR_END(BEStoreTiles, 1); 163 } 164 165 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) 166 { 167 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; 168 169 unsigned long rt = 0; 170 uint32_t mask = pDesc->attachmentMask; 171 while (_BitScanForward(&rt, mask)) 172 { 173 mask &= ~(1 << rt); 174 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt); 175 } 176 } 177 178 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) 179 { 180 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData; 181 SWR_CONTEXT *pContext = pDC->pContext; 182 183 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); 184 185 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) 186 { 187 if (pDesc->attachmentMask & (1 << i)) 188 { 189 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad( 190 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples); 191 if (pHotTile) 192 { 193 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState; 194 } 195 } 196 } 197 } 198 199 template<uint32_t sampleCountT> 200 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) 201 { 202 SWR_CONTEXT *pContext = pDC->pContext; 203 204 AR_BEGIN(BENullBackend, pDC->drawId); 205 ///@todo: handle center multisample pattern 206 AR_BEGIN(BESetup, pDC->drawId); 207 208 const API_STATE &state = GetApiState(pDC); 209 210 BarycentricCoeffs coeffs; 211 SetupBarycentricCoeffs(&coeffs, work); 212 213 uint8_t *pDepthBuffer, *pStencilBuffer; 214 SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers); 215 216 SWR_PS_CONTEXT psContext; 217 // skip SetupPixelShaderContext(&psContext, ...); // not needed here 218 219 AR_END(BESetup, 0); 220 221 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); 222 223 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); 224 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; 225 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) 226 { 227 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); 228 229 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); 230 231 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) 232 { 233 // iterate over active samples 234 unsigned long sample = 0; 235 uint32_t sampleMask = state.blendState.sampleMask; 236 while (_BitScanForward(&sample, sampleMask)) 237 { 238 sampleMask &= ~(1 << sample); 239 240 simdmask coverageMask = work.coverageMask[sample] & MASK; 241 242 if (coverageMask) 243 { 244 // offset depth/stencil buffers current sample 245 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); 246 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); 247 248 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) 249 { 250 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); 251 252 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); 253 254 const float minz = state.depthBoundsState.depthBoundsTestMinValue; 255 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; 256 257 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); 258 } 259 260 AR_BEGIN(BEBarycentric, pDC->drawId); 261 262 // calculate per sample positions 263 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample)); 264 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample)); 265 266 CalcSampleBarycentrics(coeffs, psContext); 267 268 // interpolate and quantize z 269 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); 270 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); 271 272 AR_END(BEBarycentric, 0); 273 274 // interpolate user clip distance if available 275 if (state.backendState.clipDistanceMask) 276 { 277 coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); 278 } 279 280 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); 281 simdscalar stencilPassMask = vCoverageMask; 282 283 AR_BEGIN(BEEarlyDepthTest, pDC->drawId); 284 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, 285 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); 286 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); 287 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 288 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); 289 AR_END(BEEarlyDepthTest, 0); 290 291 uint32_t statMask = _simd_movemask_ps(depthPassMask); 292 uint32_t statCount = _mm_popcnt_u32(statMask); 293 UPDATE_STAT_BE(DepthPassCount, statCount); 294 } 295 296 Endtile: 297 ATTR_UNUSED; 298 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 299 } 300 301 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; 302 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; 303 304 vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx); 305 } 306 307 vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy); 308 } 309 310 AR_END(BENullBackend, 0); 311 } 312 313 PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {}; 314 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; 315 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] 316 [2] // centroid 317 [2] // canEarlyZ 318 = {}; 319 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] 320 [2] // isCenterPattern 321 [SWR_INPUT_COVERAGE_COUNT] 322 [2] // centroid 323 [2] // forcedSampleCount 324 [2] // canEarlyZ 325 = {}; 326 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT] 327 [SWR_INPUT_COVERAGE_COUNT] 328 [2] // centroid 329 [2] // canEarlyZ 330 = {}; 331 332 void InitBackendFuncTables() 333 { 334 InitBackendPixelRate(); 335 InitBackendSingleFuncTable(gBackendSingleSample); 336 InitBackendSampleFuncTable(gBackendSampleRateTable); 337 338 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ; 339 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ; 340 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ; 341 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ; 342 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ; 343 } 344