1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file backend.cpp 24 * 25 * @brief Backend handles rasterization, pixel shading and output merger 26 * operations. 27 * 28 ******************************************************************************/ 29 30 #include <smmintrin.h> 31 32 #include "backend.h" 33 #include "depthstencil.h" 34 #include "tilemgr.h" 35 #include "memory/tilingtraits.h" 36 #include "core/multisample.h" 37 38 #include <algorithm> 39 40 typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect); 41 static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; 42 43 ////////////////////////////////////////////////////////////////////////// 44 /// @brief Process compute work. 45 /// @param pDC - pointer to draw context (dispatch). 46 /// @param workerId - The unique worker ID that is assigned to this thread. 47 /// @param threadGroupId - the linear index for the thread group within the dispatch. 48 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer) 49 { 50 SWR_CONTEXT *pContext = pDC->pContext; 51 52 AR_BEGIN(BEDispatch, pDC->drawId); 53 54 const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); 55 SWR_ASSERT(pTaskData != nullptr); 56 57 // Ensure spill fill memory has been allocated. 58 size_t spillFillSize = pDC->pState->state.totalSpillFillSize; 59 if (spillFillSize && pSpillFillBuffer == nullptr) 60 { 61 pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES); 62 } 63 64 const API_STATE& state = GetApiState(pDC); 65 66 SWR_CS_CONTEXT csContext{ 0 }; 67 csContext.tileCounter = threadGroupId; 68 csContext.dispatchDims[0] = pTaskData->threadGroupCountX; 69 csContext.dispatchDims[1] = pTaskData->threadGroupCountY; 70 csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; 71 csContext.pTGSM = pContext->ppScratch[workerId]; 72 csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; 73 74 state.pfnCsFunc(GetPrivateState(pDC), &csContext); 75 76 UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup); 77 78 AR_END(BEDispatch, 1); 79 } 80 81 ////////////////////////////////////////////////////////////////////////// 82 /// @brief Process shutdown. 83 /// @param pDC - pointer to draw context (dispatch). 84 /// @param workerId - The unique worker ID that is assigned to this thread. 85 /// @param threadGroupId - the linear index for the thread group within the dispatch. 86 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) 87 { 88 // Dummy function 89 } 90 91 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) 92 { 93 uint32_t x, y; 94 MacroTileMgr::getTileIndices(macroTile, x, y); 95 SWR_ASSERT(x == 0 && y == 0); 96 } 97 98 template<SWR_FORMAT format> 99 void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) 100 { 101 auto lambda = [&](int32_t comp) 102 { 103 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); 104 105 pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); 106 }; 107 108 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); 109 110 for (uint32_t i = 0; i < numIter; ++i) 111 { 112 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda); 113 } 114 } 115 116 #if USE_8x2_TILE_BACKEND 117 template<SWR_FORMAT format> 118 void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value) 119 { 120 auto lambda = [&](int32_t comp) 121 { 122 FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); 123 124 pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); 125 }; 126 127 const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM); 128 129 for (uint32_t i = 0; i < numIter; ++i) 130 { 131 UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda); 132 } 133 } 134 135 #endif 136 template<SWR_FORMAT format> 137 INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect) 138 { 139 // convert clear color to hottile format 140 // clear color is in RGBA float/uint32 141 #if USE_8x2_TILE_BACKEND 142 simd16vector vClear; 143 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp) 144 { 145 simd16scalar vComp; 146 vComp = _simd16_load1_ps((const float*)&clear[comp]); 147 if (FormatTraits<format>::isNormalized(comp)) 148 { 149 vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp))); 150 vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp)); 151 } 152 vComp = FormatTraits<format>::pack(comp, vComp); 153 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; 154 } 155 156 #else 157 simdvector vClear; 158 for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp) 159 { 160 simdscalar vComp; 161 vComp = _simd_load1_ps((const float*)&clear[comp]); 162 if (FormatTraits<format>::isNormalized(comp)) 163 { 164 vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp))); 165 vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); 166 } 167 vComp = FormatTraits<format>::pack(comp, vComp); 168 vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; 169 } 170 171 #endif 172 uint32_t tileX, tileY; 173 MacroTileMgr::getTileIndices(macroTile, tileX, tileY); 174 175 // Init to full macrotile 176 SWR_RECT clearTile = 177 { 178 KNOB_MACROTILE_X_DIM * int32_t(tileX), 179 KNOB_MACROTILE_Y_DIM * int32_t(tileY), 180 KNOB_MACROTILE_X_DIM * int32_t(tileX + 1), 181 KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1), 182 }; 183 184 // intersect with clear rect 185 clearTile &= rect; 186 187 // translate to local hottile origin 188 clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM); 189 190 // Make maximums inclusive (needed for convert to raster tiles) 191 clearTile.xmax -= 1; 192 clearTile.ymax -= 1; 193 194 // convert to raster tiles 195 clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT); 196 clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT); 197 clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT); 198 clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT); 199 200 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); 201 // compute steps between raster tile samples / raster tiles / macro tile rows 202 const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8; 203 const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples; 204 const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; 205 const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8); 206 207 HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex); 208 uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples; 209 uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples; 210 211 // loop over all raster tiles in the current hot tile 212 for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y) 213 { 214 uint8_t* pRasterTile = pRasterTileRow; 215 for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x) 216 { 217 for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++) 218 { 219 ClearRasterTile<format>(pRasterTile, vClear); 220 pRasterTile += rasterTileSampleStep; 221 } 222 } 223 pRasterTileRow += macroTileRowStep; 224 } 225 226 pHotTile->state = HOTTILE_DIRTY; 227 } 228 229 230 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) 231 { 232 SWR_CONTEXT *pContext = pDC->pContext; 233 234 if (KNOB_FAST_CLEAR) 235 { 236 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; 237 SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; 238 uint32_t numSamples = GetNumSamples(sampleCount); 239 240 SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason. 241 242 AR_BEGIN(BEClear, pDC->drawId); 243 244 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) 245 { 246 unsigned long rt = 0; 247 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; 248 while (_BitScanForward(&rt, mask)) 249 { 250 mask &= ~(1 << rt); 251 252 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex); 253 254 // All we want to do here is to mark the hot tile as being in a "needs clear" state. 255 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); 256 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); 257 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); 258 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); 259 pHotTile->state = HOTTILE_CLEAR; 260 } 261 } 262 263 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) 264 { 265 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex); 266 pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; 267 pHotTile->state = HOTTILE_CLEAR; 268 } 269 270 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) 271 { 272 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex); 273 274 pHotTile->clearData[0] = pClear->clearStencil; 275 pHotTile->state = HOTTILE_CLEAR; 276 } 277 278 AR_END(BEClear, 1); 279 } 280 else 281 { 282 // Legacy clear 283 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; 284 AR_BEGIN(BEClear, pDC->drawId); 285 286 if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) 287 { 288 DWORD clearData[4]; 289 clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); 290 clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); 291 clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); 292 clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); 293 294 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; 295 SWR_ASSERT(pfnClearTiles != nullptr); 296 297 unsigned long rt = 0; 298 uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; 299 while (_BitScanForward(&rt, mask)) 300 { 301 mask &= ~(1 << rt); 302 303 pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); 304 } 305 } 306 307 if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) 308 { 309 DWORD clearData[4]; 310 clearData[0] = *(DWORD*)&pClear->clearDepth; 311 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; 312 SWR_ASSERT(pfnClearTiles != nullptr); 313 314 pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); 315 } 316 317 if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) 318 { 319 DWORD clearData[4]; 320 clearData[0] = pClear->clearStencil; 321 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; 322 323 pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); 324 } 325 326 AR_END(BEClear, 1); 327 } 328 } 329 330 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 331 SWR_RENDERTARGET_ATTACHMENT attachment) 332 { 333 SWR_CONTEXT *pContext = pDC->pContext; 334 335 AR_BEGIN(BEStoreTiles, pDC->drawId); 336 337 SWR_FORMAT srcFormat; 338 switch (attachment) 339 { 340 case SWR_ATTACHMENT_COLOR0: 341 case SWR_ATTACHMENT_COLOR1: 342 case SWR_ATTACHMENT_COLOR2: 343 case SWR_ATTACHMENT_COLOR3: 344 case SWR_ATTACHMENT_COLOR4: 345 case SWR_ATTACHMENT_COLOR5: 346 case SWR_ATTACHMENT_COLOR6: 347 case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; 348 case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; 349 case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; 350 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; 351 } 352 353 uint32_t x, y; 354 MacroTileMgr::getTileIndices(macroTile, x, y); 355 356 // Only need to store the hottile if it's been rendered to... 357 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false); 358 if (pHotTile) 359 { 360 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. 361 if (pHotTile->state == HOTTILE_CLEAR) 362 { 363 PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat]; 364 SWR_ASSERT(pfnClearTiles != nullptr); 365 366 pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect); 367 } 368 369 if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) 370 { 371 int32_t destX = KNOB_MACROTILE_X_DIM * x; 372 int32_t destY = KNOB_MACROTILE_Y_DIM * y; 373 374 pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, 375 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); 376 } 377 378 379 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) 380 { 381 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; 382 } 383 } 384 AR_END(BEStoreTiles, 1); 385 } 386 387 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) 388 { 389 STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; 390 391 unsigned long rt = 0; 392 uint32_t mask = pDesc->attachmentMask; 393 while (_BitScanForward(&rt, mask)) 394 { 395 mask &= ~(1 << rt); 396 ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt); 397 } 398 } 399 400 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) 401 { 402 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData; 403 SWR_CONTEXT *pContext = pDC->pContext; 404 405 const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); 406 407 for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) 408 { 409 if (pDesc->attachmentMask & (1 << i)) 410 { 411 HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad( 412 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples); 413 if (pHotTile) 414 { 415 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState; 416 } 417 } 418 } 419 } 420 421 #if KNOB_SIMD_WIDTH == 8 422 const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; 423 const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; 424 const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; 425 const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; 426 #else 427 #error Unsupported vector width 428 #endif 429 430 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) 431 { 432 simdscalar vClipMask = _simd_setzero_ps(); 433 uint32_t numClipDistance = _mm_popcnt_u32(clipMask); 434 435 for (uint32_t i = 0; i < numClipDistance; ++i) 436 { 437 // pull triangle clip distance values from clip buffer 438 simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); 439 simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); 440 simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); 441 442 // interpolate 443 simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); 444 445 // clip if interpolated clip distance is < 0 || NAN 446 simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); 447 448 vClipMask = _simd_or_ps(vClipMask, vCull); 449 } 450 451 return _simd_movemask_ps(vClipMask); 452 } 453 454 template<typename T> 455 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) 456 { 457 SWR_CONTEXT *pContext = pDC->pContext; 458 459 AR_BEGIN(BESingleSampleBackend, pDC->drawId); 460 AR_BEGIN(BESetup, pDC->drawId); 461 462 const API_STATE &state = GetApiState(pDC); 463 464 BarycentricCoeffs coeffs; 465 SetupBarycentricCoeffs(&coeffs, work); 466 467 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; 468 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); 469 470 SWR_PS_CONTEXT psContext; 471 SetupPixelShaderContext<T>(&psContext, work); 472 473 AR_END(BESetup, 1); 474 475 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); 476 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); 477 478 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); 479 480 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) 481 { 482 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); 483 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); 484 485 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); 486 487 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) 488 { 489 #if USE_8x2_TILE_BACKEND 490 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); 491 492 #endif 493 simdmask coverageMask = work.coverageMask[0] & MASK; 494 495 if (coverageMask) 496 { 497 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) 498 { 499 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); 500 501 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer)); 502 503 const float minz = state.depthBoundsState.depthBoundsTestMinValue; 504 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; 505 506 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); 507 } 508 509 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) 510 { 511 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; 512 513 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); 514 } 515 516 AR_BEGIN(BEBarycentric, pDC->drawId); 517 518 CalcPixelBarycentrics(coeffs, psContext); 519 520 CalcCentroid<T, true>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); 521 522 // interpolate and quantize z 523 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); 524 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); 525 526 AR_END(BEBarycentric, 1); 527 528 // interpolate user clip distance if available 529 if (state.rastState.clipDistanceMask) 530 { 531 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center); 532 } 533 534 simdscalar vCoverageMask = vMask(coverageMask); 535 simdscalar depthPassMask = vCoverageMask; 536 simdscalar stencilPassMask = vCoverageMask; 537 538 // Early-Z? 539 if (T::bCanEarlyZ) 540 { 541 AR_BEGIN(BEEarlyDepthTest, pDC->drawId); 542 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, 543 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); 544 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask))); 545 AR_END(BEEarlyDepthTest, 0); 546 547 // early-exit if no pixels passed depth or earlyZ is forced on 548 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) 549 { 550 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 551 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); 552 553 if (!_simd_movemask_ps(depthPassMask)) 554 { 555 goto Endtile; 556 } 557 } 558 } 559 560 psContext.sampleIndex = 0; 561 psContext.activeMask = _simd_castps_si(vCoverageMask); 562 563 // execute pixel shader 564 AR_BEGIN(BEPixelShader, pDC->drawId); 565 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); 566 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); 567 AR_END(BEPixelShader, 0); 568 569 vCoverageMask = _simd_castsi_ps(psContext.activeMask); 570 571 // late-Z 572 if (!T::bCanEarlyZ) 573 { 574 AR_BEGIN(BELateDepthTest, pDC->drawId); 575 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, 576 psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); 577 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask))); 578 AR_END(BELateDepthTest, 0); 579 580 if (!_simd_movemask_ps(depthPassMask)) 581 { 582 // need to call depth/stencil write for stencil write 583 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 584 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); 585 goto Endtile; 586 } 587 } 588 589 uint32_t statMask = _simd_movemask_ps(depthPassMask); 590 uint32_t statCount = _mm_popcnt_u32(statMask); 591 UPDATE_STAT_BE(DepthPassCount, statCount); 592 593 // output merger 594 AR_BEGIN(BEOutputMerger, pDC->drawId); 595 #if USE_8x2_TILE_BACKEND 596 OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); 597 #else 598 OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); 599 #endif 600 601 // do final depth write after all pixel kills 602 if (!state.psState.forceEarlyZ) 603 { 604 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 605 pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); 606 } 607 AR_END(BEOutputMerger, 0); 608 } 609 610 Endtile: 611 AR_BEGIN(BEEndTile, pDC->drawId); 612 613 work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 614 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) 615 { 616 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 617 } 618 619 #if USE_8x2_TILE_BACKEND 620 if (useAlternateOffset) 621 { 622 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) 623 { 624 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; 625 } 626 } 627 #else 628 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) 629 { 630 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; 631 } 632 #endif 633 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; 634 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; 635 636 AR_END(BEEndTile, 0); 637 638 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); 639 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); 640 } 641 642 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); 643 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); 644 } 645 646 AR_END(BESingleSampleBackend, 0); 647 } 648 649 template<typename T> 650 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) 651 { 652 SWR_CONTEXT *pContext = pDC->pContext; 653 654 AR_BEGIN(BESampleRateBackend, pDC->drawId); 655 AR_BEGIN(BESetup, pDC->drawId); 656 657 const API_STATE &state = GetApiState(pDC); 658 659 BarycentricCoeffs coeffs; 660 SetupBarycentricCoeffs(&coeffs, work); 661 662 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; 663 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); 664 665 SWR_PS_CONTEXT psContext; 666 SetupPixelShaderContext<T>(&psContext, work); 667 668 AR_END(BESetup, 0); 669 670 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); 671 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); 672 673 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); 674 675 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) 676 { 677 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); 678 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); 679 680 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); 681 682 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) 683 { 684 #if USE_8x2_TILE_BACKEND 685 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); 686 687 #endif 688 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) 689 { 690 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; 691 692 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); 693 } 694 695 AR_BEGIN(BEBarycentric, pDC->drawId); 696 697 CalcPixelBarycentrics(coeffs, psContext); 698 699 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); 700 701 AR_END(BEBarycentric, 0); 702 703 for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++) 704 { 705 simdmask coverageMask = work.coverageMask[sample] & MASK; 706 707 if (coverageMask) 708 { 709 // offset depth/stencil buffers current sample 710 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); 711 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); 712 713 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) 714 { 715 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); 716 717 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); 718 719 const float minz = state.depthBoundsState.depthBoundsTestMinValue; 720 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; 721 722 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); 723 } 724 725 AR_BEGIN(BEBarycentric, pDC->drawId); 726 727 // calculate per sample positions 728 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample)); 729 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample)); 730 731 CalcSampleBarycentrics(coeffs, psContext); 732 733 // interpolate and quantize z 734 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); 735 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); 736 737 AR_END(BEBarycentric, 0); 738 739 // interpolate user clip distance if available 740 if (state.rastState.clipDistanceMask) 741 { 742 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); 743 } 744 745 simdscalar vCoverageMask = vMask(coverageMask); 746 simdscalar depthPassMask = vCoverageMask; 747 simdscalar stencilPassMask = vCoverageMask; 748 749 // Early-Z? 750 if (T::bCanEarlyZ) 751 { 752 AR_BEGIN(BEEarlyDepthTest, pDC->drawId); 753 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, 754 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); 755 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask))); 756 AR_END(BEEarlyDepthTest, 0); 757 758 // early-exit if no samples passed depth or earlyZ is forced on. 759 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) 760 { 761 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 762 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); 763 764 if (!_simd_movemask_ps(depthPassMask)) 765 { 766 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 767 continue; 768 } 769 } 770 } 771 772 psContext.sampleIndex = sample; 773 psContext.activeMask = _simd_castps_si(vCoverageMask); 774 775 // execute pixel shader 776 AR_BEGIN(BEPixelShader, pDC->drawId); 777 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); 778 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); 779 AR_END(BEPixelShader, 0); 780 781 vCoverageMask = _simd_castsi_ps(psContext.activeMask); 782 783 // late-Z 784 if (!T::bCanEarlyZ) 785 { 786 AR_BEGIN(BELateDepthTest, pDC->drawId); 787 depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, 788 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); 789 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask))); 790 AR_END(BELateDepthTest, 0); 791 792 if (!_simd_movemask_ps(depthPassMask)) 793 { 794 // need to call depth/stencil write for stencil write 795 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 796 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); 797 798 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 799 continue; 800 } 801 } 802 803 uint32_t statMask = _simd_movemask_ps(depthPassMask); 804 uint32_t statCount = _mm_popcnt_u32(statMask); 805 UPDATE_STAT_BE(DepthPassCount, statCount); 806 807 // output merger 808 AR_BEGIN(BEOutputMerger, pDC->drawId); 809 #if USE_8x2_TILE_BACKEND 810 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); 811 #else 812 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); 813 #endif 814 815 // do final depth write after all pixel kills 816 if (!state.psState.forceEarlyZ) 817 { 818 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 819 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); 820 } 821 AR_END(BEOutputMerger, 0); 822 } 823 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 824 } 825 826 Endtile: 827 ATTR_UNUSED; 828 829 AR_BEGIN(BEEndTile, pDC->drawId); 830 831 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) 832 { 833 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 834 } 835 836 #if USE_8x2_TILE_BACKEND 837 if (useAlternateOffset) 838 { 839 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) 840 { 841 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; 842 } 843 } 844 #else 845 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) 846 { 847 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; 848 } 849 #endif 850 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; 851 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; 852 853 AR_END(BEEndTile, 0); 854 855 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); 856 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); 857 } 858 859 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); 860 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); 861 } 862 863 AR_END(BESampleRateBackend, 0); 864 } 865 866 template<typename T> 867 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) 868 { 869 SWR_CONTEXT *pContext = pDC->pContext; 870 871 AR_BEGIN(BEPixelRateBackend, pDC->drawId); 872 AR_BEGIN(BESetup, pDC->drawId); 873 874 const API_STATE &state = GetApiState(pDC); 875 876 BarycentricCoeffs coeffs; 877 SetupBarycentricCoeffs(&coeffs, work); 878 879 uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; 880 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); 881 882 SWR_PS_CONTEXT psContext; 883 SetupPixelShaderContext<T>(&psContext, work); 884 885 AR_END(BESetup, 0); 886 887 PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); 888 889 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); 890 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); 891 892 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); 893 894 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) 895 { 896 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); 897 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); 898 899 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); 900 901 for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) 902 { 903 #if USE_8x2_TILE_BACKEND 904 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); 905 906 #endif 907 simdscalar activeLanes; 908 if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; 909 activeLanes = vMask(work.anyCoveredSamples & MASK); 910 911 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) 912 { 913 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; 914 915 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); 916 } 917 918 AR_BEGIN(BEBarycentric, pDC->drawId); 919 920 CalcPixelBarycentrics(coeffs, psContext); 921 922 CalcCentroid<T, false>(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); 923 924 AR_END(BEBarycentric, 0); 925 926 if(T::bForcedSampleCount) 927 { 928 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set 929 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si())); 930 activeLanes = _simd_and_ps(activeLanes, vSampleMask); 931 } 932 933 // Early-Z? 934 if(T::bCanEarlyZ && !T::bForcedSampleCount) 935 { 936 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest); 937 UPDATE_STAT_BE(DepthPassCount, depthPassCount); 938 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); 939 } 940 941 // if we have no covered samples that passed depth at this point, go to next tile 942 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; 943 944 if(state.psState.usesSourceDepth) 945 { 946 AR_BEGIN(BEBarycentric, pDC->drawId); 947 // interpolate and quantize z 948 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); 949 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); 950 AR_END(BEBarycentric, 0); 951 } 952 953 // pixels that are currently active 954 psContext.activeMask = _simd_castps_si(activeLanes); 955 psContext.oMask = T::MultisampleT::FullSampleMask(); 956 957 // execute pixel shader 958 AR_BEGIN(BEPixelShader, pDC->drawId); 959 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); 960 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes))); 961 AR_END(BEPixelShader, 0); 962 963 // update active lanes to remove any discarded or oMask'd pixels 964 activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si()))); 965 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; 966 967 // late-Z 968 if(!T::bCanEarlyZ && !T::bForcedSampleCount) 969 { 970 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest); 971 UPDATE_STAT_BE(DepthPassCount, depthPassCount); 972 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes))); 973 } 974 975 // if we have no covered samples that passed depth at this point, skip OM and go to next tile 976 if(!_simd_movemask_ps(activeLanes)) { goto Endtile; }; 977 978 // output merger 979 // loop over all samples, broadcasting the results of the PS to all passing pixels 980 for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++) 981 { 982 AR_BEGIN(BEOutputMerger, pDC->drawId); 983 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples 984 uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0; 985 simdscalar coverageMask, depthMask; 986 if(T::bForcedSampleCount) 987 { 988 coverageMask = depthMask = activeLanes; 989 } 990 else 991 { 992 coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum]; 993 depthMask = PixelRateZTest.depthPassMask[coverageSampleNum]; 994 if(!_simd_movemask_ps(depthMask)) 995 { 996 // stencil should already have been written in early/lateZ tests 997 AR_END(BEOutputMerger, 0); 998 continue; 999 } 1000 } 1001 1002 // broadcast the results of the PS to all passing pixels 1003 #if USE_8x2_TILE_BACKEND 1004 OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); 1005 #else 1006 OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); 1007 #endif 1008 1009 if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) 1010 { 1011 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); 1012 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); 1013 1014 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], 1015 pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); 1016 } 1017 AR_END(BEOutputMerger, 0); 1018 } 1019 Endtile: 1020 AR_BEGIN(BEEndTile, pDC->drawId); 1021 1022 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) 1023 { 1024 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 1025 } 1026 1027 if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) 1028 { 1029 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 1030 } 1031 work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 1032 1033 #if USE_8x2_TILE_BACKEND 1034 if (useAlternateOffset) 1035 { 1036 for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) 1037 { 1038 pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; 1039 } 1040 } 1041 #else 1042 for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) 1043 { 1044 pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; 1045 } 1046 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; 1047 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; 1048 #endif 1049 1050 AR_END(BEEndTile, 0); 1051 1052 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); 1053 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); 1054 } 1055 1056 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); 1057 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); 1058 } 1059 1060 AR_END(BEPixelRateBackend, 0); 1061 } 1062 // optimized backend flow with NULL PS 1063 template<uint32_t sampleCountT> 1064 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) 1065 { 1066 SWR_CONTEXT *pContext = pDC->pContext; 1067 1068 AR_BEGIN(BENullBackend, pDC->drawId); 1069 ///@todo: handle center multisample pattern 1070 typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T; 1071 AR_BEGIN(BESetup, pDC->drawId); 1072 1073 const API_STATE &state = GetApiState(pDC); 1074 1075 BarycentricCoeffs coeffs; 1076 SetupBarycentricCoeffs(&coeffs, work); 1077 1078 uint8_t *pDepthBuffer, *pStencilBuffer; 1079 SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers); 1080 1081 SWR_PS_CONTEXT psContext; 1082 // skip SetupPixelShaderContext(&psContext, ...); // not needed here 1083 1084 AR_END(BESetup, 0); 1085 1086 simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); 1087 1088 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); 1089 1090 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) 1091 { 1092 simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); 1093 1094 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); 1095 1096 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) 1097 { 1098 // iterate over active samples 1099 unsigned long sample = 0; 1100 uint32_t sampleMask = state.blendState.sampleMask; 1101 while (_BitScanForward(&sample, sampleMask)) 1102 { 1103 sampleMask &= ~(1 << sample); 1104 1105 simdmask coverageMask = work.coverageMask[sample] & MASK; 1106 1107 if (coverageMask) 1108 { 1109 // offset depth/stencil buffers current sample 1110 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); 1111 uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); 1112 1113 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) 1114 { 1115 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); 1116 1117 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); 1118 1119 const float minz = state.depthBoundsState.depthBoundsTestMinValue; 1120 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; 1121 1122 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); 1123 } 1124 1125 AR_BEGIN(BEBarycentric, pDC->drawId); 1126 1127 // calculate per sample positions 1128 psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample)); 1129 psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample)); 1130 1131 CalcSampleBarycentrics(coeffs, psContext); 1132 1133 // interpolate and quantize z 1134 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); 1135 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); 1136 1137 AR_END(BEBarycentric, 0); 1138 1139 // interpolate user clip distance if available 1140 if (state.rastState.clipDistanceMask) 1141 { 1142 coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); 1143 } 1144 1145 simdscalar vCoverageMask = vMask(coverageMask); 1146 simdscalar stencilPassMask = vCoverageMask; 1147 1148 AR_BEGIN(BEEarlyDepthTest, pDC->drawId); 1149 simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, 1150 psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); 1151 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(vCoverageMask), _simd_movemask_ps(stencilPassMask))); 1152 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, 1153 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); 1154 AR_END(BEEarlyDepthTest, 0); 1155 1156 uint32_t statMask = _simd_movemask_ps(depthPassMask); 1157 uint32_t statCount = _mm_popcnt_u32(statMask); 1158 UPDATE_STAT_BE(DepthPassCount, statCount); 1159 } 1160 1161 Endtile: 1162 ATTR_UNUSED; 1163 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); 1164 } 1165 1166 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; 1167 pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; 1168 1169 vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx); 1170 } 1171 1172 vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy); 1173 } 1174 1175 AR_END(BENullBackend, 0); 1176 } 1177 1178 void InitClearTilesTable() 1179 { 1180 memset(sClearTilesTable, 0, sizeof(sClearTilesTable)); 1181 1182 sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>; 1183 sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>; 1184 sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>; 1185 sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>; 1186 sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>; 1187 } 1188 1189 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; 1190 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] 1191 [2] // centroid 1192 [2] // canEarlyZ 1193 = {}; 1194 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] 1195 [SWR_MSAA_SAMPLE_PATTERN_COUNT] 1196 [SWR_INPUT_COVERAGE_COUNT] 1197 [2] // centroid 1198 [2] // forcedSampleCount 1199 [2] // canEarlyZ 1200 = {}; 1201 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT] 1202 [SWR_INPUT_COVERAGE_COUNT] 1203 [2] // centroid 1204 [2] // canEarlyZ 1205 = {}; 1206 1207 // Recursive template used to auto-nest conditionals. Converts dynamic enum function 1208 // arguments to static template arguments. 1209 template <uint32_t... ArgsT> 1210 struct BEChooser 1211 { 1212 // Last Arg Terminator 1213 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) 1214 { 1215 switch(tArg) 1216 { 1217 case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break; 1218 case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break; 1219 case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break; 1220 default: 1221 SWR_ASSERT(0 && "Invalid backend func\n"); 1222 return nullptr; 1223 break; 1224 } 1225 } 1226 1227 // Recursively parse args 1228 template <typename... TArgsT> 1229 static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs) 1230 { 1231 switch(tArg) 1232 { 1233 case SWR_MSAA_CENTER_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_CENTER_PATTERN>::GetFunc(remainingArgs...); break; 1234 case SWR_MSAA_STANDARD_PATTERN: return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); break; 1235 default: 1236 SWR_ASSERT(0 && "Invalid sample pattern\n"); 1237 return BEChooser<ArgsT..., SWR_MSAA_STANDARD_PATTERN>::GetFunc(remainingArgs...); 1238 break; 1239 } 1240 } 1241 1242 // Recursively parse args 1243 template <typename... TArgsT> 1244 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) 1245 { 1246 switch(tArg) 1247 { 1248 case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break; 1249 case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break; 1250 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break; 1251 default: 1252 SWR_ASSERT(0 && "Invalid sample pattern\n"); 1253 return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); 1254 break; 1255 } 1256 } 1257 1258 // Recursively parse args 1259 template <typename... TArgsT> 1260 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) 1261 { 1262 switch(tArg) 1263 { 1264 case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break; 1265 case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break; 1266 case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break; 1267 case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break; 1268 case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break; 1269 default: 1270 SWR_ASSERT(0 && "Invalid sample count\n"); 1271 return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); 1272 break; 1273 } 1274 } 1275 1276 // Recursively parse args 1277 template <typename... TArgsT> 1278 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs) 1279 { 1280 if(tArg == true) 1281 { 1282 return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...); 1283 } 1284 1285 return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...); 1286 } 1287 }; 1288 1289 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]) 1290 { 1291 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) 1292 { 1293 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) 1294 { 1295 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) 1296 { 1297 table[inputCoverage][isCentroid][canEarlyZ] = 1298 BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage, 1299 (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); 1300 } 1301 } 1302 } 1303 } 1304 1305 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2]) 1306 { 1307 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) 1308 { 1309 for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++) 1310 { 1311 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) 1312 { 1313 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) 1314 { 1315 for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++) 1316 { 1317 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) 1318 { 1319 table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] = 1320 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage, 1321 (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE); 1322 } 1323 } 1324 } 1325 } 1326 } 1327 } 1328 } 1329 1330 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) 1331 { 1332 for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) 1333 { 1334 for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) 1335 { 1336 for(uint32_t centroid = 0; centroid < 2; centroid++) 1337 { 1338 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) 1339 { 1340 table[sampleCount][inputCoverage][centroid][canEarlyZ] = 1341 BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage, 1342 (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); 1343 } 1344 } 1345 } 1346 } 1347 } 1348 1349 void InitBackendFuncTables() 1350 { 1351 InitBackendSingleFuncTable(gBackendSingleSample); 1352 InitBackendPixelFuncTable(gBackendPixelRateTable); 1353 InitBackendSampleFuncTable(gBackendSampleRateTable); 1354 1355 gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ; 1356 gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ; 1357 gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ; 1358 gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ; 1359 gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ; 1360 } 1361