1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file tilemgr.cpp 24 * 25 * @brief Implementation for Macro Tile Manager which provides the facilities 26 * for threads to work on an macro tile. 27 * 28 ******************************************************************************/ 29 #include <unordered_map> 30 31 #include "fifo.hpp" 32 #include "core/tilemgr.h" 33 #include "core/multisample.h" 34 #include "rdtsc_core.h" 35 36 #define TILE_ID(x,y) ((x << 16 | y)) 37 38 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) 39 { 40 } 41 42 void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) 43 { 44 // Should not enqueue more then what we have backing for in the hot tile manager. 45 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); 46 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); 47 48 if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1))) 49 { 50 return; 51 } 52 53 uint32_t id = TILE_ID(x, y); 54 55 MacroTileQueue &tile = mTiles[id]; 56 tile.mWorkItemsFE++; 57 tile.mId = id; 58 59 if (tile.mWorkItemsFE == 1) 60 { 61 tile.clear(mArena); 62 mDirtyTiles.push_back(&tile); 63 } 64 65 mWorkItemsProduced++; 66 tile.enqueue_try_nosync(mArena, pWork); 67 } 68 69 void MacroTileMgr::markTileComplete(uint32_t id) 70 { 71 SWR_ASSERT(mTiles.find(id) != mTiles.end()); 72 MacroTileQueue &tile = mTiles[id]; 73 uint32_t numTiles = tile.mWorkItemsFE; 74 InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles); 75 76 _ReadWriteBarrier(); 77 tile.mWorkItemsBE += numTiles; 78 SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE); 79 80 // clear out tile, but defer fifo clear until the next DC first queues to it. 81 // this prevents worker threads from constantly locking a completed macro tile 82 tile.mWorkItemsFE = 0; 83 tile.mWorkItemsBE = 0; 84 } 85 86 HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples, 87 uint32_t renderTargetArrayIndex) 88 { 89 uint32_t x, y; 90 MacroTileMgr::getTileIndices(macroID, x, y); 91 92 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); 93 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); 94 95 HotTileSet &tile = mHotTiles[x][y]; 96 HOTTILE& hotTile = tile.Attachment[attachment]; 97 if (hotTile.pBuffer == NULL) 98 { 99 if (create) 100 { 101 uint32_t size = numSamples * mHotTileSize[attachment]; 102 uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); 103 hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); 104 hotTile.state = HOTTILE_INVALID; 105 hotTile.numSamples = numSamples; 106 hotTile.renderTargetArrayIndex = renderTargetArrayIndex; 107 } 108 else 109 { 110 return NULL; 111 } 112 } 113 else 114 { 115 // free the old tile and create a new one with enough space to hold all samples 116 if (numSamples > hotTile.numSamples) 117 { 118 // tile should be either uninitialized or resolved if we're deleting and switching to a 119 // new sample count 120 SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || 121 (hotTile.state == HOTTILE_RESOLVED) || 122 (hotTile.state == HOTTILE_CLEAR)); 123 FreeHotTileMem(hotTile.pBuffer); 124 125 uint32_t size = numSamples * mHotTileSize[attachment]; 126 uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); 127 hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); 128 hotTile.state = HOTTILE_INVALID; 129 hotTile.numSamples = numSamples; 130 } 131 132 // if requested render target array index isn't currently loaded, need to store out the current hottile 133 // and load the requested array slice 134 if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) 135 { 136 SWR_FORMAT format; 137 switch (attachment) 138 { 139 case SWR_ATTACHMENT_COLOR0: 140 case SWR_ATTACHMENT_COLOR1: 141 case SWR_ATTACHMENT_COLOR2: 142 case SWR_ATTACHMENT_COLOR3: 143 case SWR_ATTACHMENT_COLOR4: 144 case SWR_ATTACHMENT_COLOR5: 145 case SWR_ATTACHMENT_COLOR6: 146 case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; 147 case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; 148 case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; 149 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; 150 } 151 152 if (hotTile.state == HOTTILE_CLEAR) 153 { 154 if (attachment == SWR_ATTACHMENT_STENCIL) 155 ClearStencilHotTile(&hotTile); 156 else if (attachment == SWR_ATTACHMENT_DEPTH) 157 ClearDepthHotTile(&hotTile); 158 else 159 ClearColorHotTile(&hotTile); 160 161 hotTile.state = HOTTILE_DIRTY; 162 } 163 164 if (hotTile.state == HOTTILE_DIRTY) 165 { 166 pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, 167 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); 168 } 169 170 pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, 171 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); 172 173 hotTile.renderTargetArrayIndex = renderTargetArrayIndex; 174 hotTile.state = HOTTILE_DIRTY; 175 } 176 } 177 return &tile.Attachment[attachment]; 178 } 179 180 HOTTILE* HotTileMgr::GetHotTileNoLoad( 181 SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, 182 SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples) 183 { 184 uint32_t x, y; 185 MacroTileMgr::getTileIndices(macroID, x, y); 186 187 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); 188 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); 189 190 HotTileSet &tile = mHotTiles[x][y]; 191 HOTTILE& hotTile = tile.Attachment[attachment]; 192 if (hotTile.pBuffer == NULL) 193 { 194 if (create) 195 { 196 uint32_t size = numSamples * mHotTileSize[attachment]; 197 hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, KNOB_SIMD_WIDTH * 4); 198 hotTile.state = HOTTILE_INVALID; 199 hotTile.numSamples = numSamples; 200 hotTile.renderTargetArrayIndex = 0; 201 } 202 else 203 { 204 return NULL; 205 } 206 } 207 208 return &hotTile; 209 } 210 211 #if USE_8x2_TILE_BACKEND 212 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. 213 { 214 // Load clear color into SIMD register... 215 float *pClearData = (float *)(pHotTile->clearData); 216 simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]); 217 simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]); 218 simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]); 219 simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]); 220 221 float *pfBuf = (float *)pHotTile->pBuffer; 222 uint32_t numSamples = pHotTile->numSamples; 223 224 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 225 { 226 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 227 { 228 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) 229 { 230 _simd16_store_ps(pfBuf, valR); 231 pfBuf += KNOB_SIMD16_WIDTH; 232 233 _simd16_store_ps(pfBuf, valG); 234 pfBuf += KNOB_SIMD16_WIDTH; 235 236 _simd16_store_ps(pfBuf, valB); 237 pfBuf += KNOB_SIMD16_WIDTH; 238 239 _simd16_store_ps(pfBuf, valA); 240 pfBuf += KNOB_SIMD16_WIDTH; 241 } 242 } 243 } 244 } 245 246 void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. 247 { 248 // Load clear color into SIMD register... 249 float *pClearData = (float *)(pHotTile->clearData); 250 simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]); 251 252 float *pfBuf = (float *)pHotTile->pBuffer; 253 uint32_t numSamples = pHotTile->numSamples; 254 255 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 256 { 257 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 258 { 259 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM) 260 { 261 _simd16_store_ps(pfBuf, valZ); 262 pfBuf += KNOB_SIMD16_WIDTH; 263 } 264 } 265 } 266 } 267 268 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) 269 { 270 // convert from F32 to U8. 271 uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); 272 //broadcast 32x into __m256i... 273 simd16scalari valS = _simd16_set1_epi8(clearVal); 274 275 simd16scalari *pBuf = (simd16scalari *)pHotTile->pBuffer; 276 uint32_t numSamples = pHotTile->numSamples; 277 278 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 279 { 280 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 281 { 282 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. 283 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4) 284 { 285 _simd16_store_si(pBuf, valS); 286 pBuf += 1; 287 } 288 } 289 } 290 } 291 292 #else 293 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. 294 { 295 // Load clear color into SIMD register... 296 float *pClearData = (float*)(pHotTile->clearData); 297 simdscalar valR = _simd_broadcast_ss(&pClearData[0]); 298 simdscalar valG = _simd_broadcast_ss(&pClearData[1]); 299 simdscalar valB = _simd_broadcast_ss(&pClearData[2]); 300 simdscalar valA = _simd_broadcast_ss(&pClearData[3]); 301 302 float *pfBuf = (float*)pHotTile->pBuffer; 303 uint32_t numSamples = pHotTile->numSamples; 304 305 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 306 { 307 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 308 { 309 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) 310 { 311 _simd_store_ps(pfBuf, valR); 312 pfBuf += KNOB_SIMD_WIDTH; 313 _simd_store_ps(pfBuf, valG); 314 pfBuf += KNOB_SIMD_WIDTH; 315 _simd_store_ps(pfBuf, valB); 316 pfBuf += KNOB_SIMD_WIDTH; 317 _simd_store_ps(pfBuf, valA); 318 pfBuf += KNOB_SIMD_WIDTH; 319 } 320 } 321 } 322 } 323 324 void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. 325 { 326 // Load clear color into SIMD register... 327 float *pClearData = (float*)(pHotTile->clearData); 328 simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); 329 330 float *pfBuf = (float*)pHotTile->pBuffer; 331 uint32_t numSamples = pHotTile->numSamples; 332 333 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 334 { 335 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 336 { 337 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) 338 { 339 _simd_store_ps(pfBuf, valZ); 340 pfBuf += KNOB_SIMD_WIDTH; 341 } 342 } 343 } 344 } 345 346 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) 347 { 348 // convert from F32 to U8. 349 uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); 350 //broadcast 32x into __m256i... 351 simdscalari valS = _simd_set1_epi8(clearVal); 352 353 simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; 354 uint32_t numSamples = pHotTile->numSamples; 355 356 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 357 { 358 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 359 { 360 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. 361 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) 362 { 363 _simd_store_si(pBuf, valS); 364 pBuf += 1; 365 } 366 } 367 } 368 } 369 370 #endif 371 ////////////////////////////////////////////////////////////////////////// 372 /// @brief InitializeHotTiles 373 /// for draw calls, we initialize the active hot tiles and perform deferred 374 /// load on them if tile is in invalid state. we do this in the outer thread 375 /// loop instead of inside the draw routine itself mainly for performance, 376 /// to avoid unnecessary setup every triangle 377 /// @todo support deferred clear 378 /// @param pCreateInfo - pointer to creation info. 379 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID) 380 { 381 const API_STATE& state = GetApiState(pDC); 382 383 uint32_t x, y; 384 MacroTileMgr::getTileIndices(macroID, x, y); 385 x *= KNOB_MACROTILE_X_DIM; 386 y *= KNOB_MACROTILE_Y_DIM; 387 388 uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); 389 390 // check RT if enabled 391 unsigned long rtSlot = 0; 392 uint32_t colorHottileEnableMask = state.colorHottileEnable; 393 while (_BitScanForward(&rtSlot, colorHottileEnableMask)) 394 { 395 HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); 396 397 if (pHotTile->state == HOTTILE_INVALID) 398 { 399 AR_BEGIN(BELoadTiles, pDC->drawId); 400 // invalid hottile before draw requires a load from surface before we can draw to it 401 pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); 402 pHotTile->state = HOTTILE_DIRTY; 403 AR_END(BELoadTiles, 0); 404 } 405 else if (pHotTile->state == HOTTILE_CLEAR) 406 { 407 AR_BEGIN(BELoadTiles, pDC->drawId); 408 // Clear the tile. 409 ClearColorHotTile(pHotTile); 410 pHotTile->state = HOTTILE_DIRTY; 411 AR_END(BELoadTiles, 0); 412 } 413 colorHottileEnableMask &= ~(1 << rtSlot); 414 } 415 416 // check depth if enabled 417 if (state.depthHottileEnable) 418 { 419 HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); 420 if (pHotTile->state == HOTTILE_INVALID) 421 { 422 AR_BEGIN(BELoadTiles, pDC->drawId); 423 // invalid hottile before draw requires a load from surface before we can draw to it 424 pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); 425 pHotTile->state = HOTTILE_DIRTY; 426 AR_END(BELoadTiles, 0); 427 } 428 else if (pHotTile->state == HOTTILE_CLEAR) 429 { 430 AR_BEGIN(BELoadTiles, pDC->drawId); 431 // Clear the tile. 432 ClearDepthHotTile(pHotTile); 433 pHotTile->state = HOTTILE_DIRTY; 434 AR_END(BELoadTiles, 0); 435 } 436 } 437 438 // check stencil if enabled 439 if (state.stencilHottileEnable) 440 { 441 HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); 442 if (pHotTile->state == HOTTILE_INVALID) 443 { 444 AR_BEGIN(BELoadTiles, pDC->drawId); 445 // invalid hottile before draw requires a load from surface before we can draw to it 446 pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); 447 pHotTile->state = HOTTILE_DIRTY; 448 AR_END(BELoadTiles, 0); 449 } 450 else if (pHotTile->state == HOTTILE_CLEAR) 451 { 452 AR_BEGIN(BELoadTiles, pDC->drawId); 453 // Clear the tile. 454 ClearStencilHotTile(pHotTile); 455 pHotTile->state = HOTTILE_DIRTY; 456 AR_END(BELoadTiles, 0); 457 } 458 } 459 } 460