1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file backend.h 24 * 25 * @brief Backend handles rasterization, pixel shading and output merger 26 * operations. 
27 * 28 ******************************************************************************/ 29 #pragma once 30 31 #include "common/os.h" 32 #include "core/context.h" 33 #include "core/multisample.h" 34 #include "rdtsc_core.h" 35 36 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); 37 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); 38 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); 39 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); 40 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); 41 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); 42 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); 43 void InitClearTilesTable(); 44 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); 45 void InitBackendFuncTables(); 46 void InitCPSFuncTables(); 47 void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext); 48 49 enum SWR_BACKEND_FUNCS 50 { 51 SWR_BACKEND_SINGLE_SAMPLE, 52 SWR_BACKEND_MSAA_PIXEL_RATE, 53 SWR_BACKEND_MSAA_SAMPLE_RATE, 54 SWR_BACKEND_FUNCS_MAX, 55 }; 56 57 #if KNOB_SIMD_WIDTH == 8 58 extern const __m256 vCenterOffsetsX; 59 extern const __m256 vCenterOffsetsY; 60 extern const __m256 vULOffsetsX; 61 extern const __m256 vULOffsetsY; 62 #define MASK 0xff 63 #endif 64 65 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) 66 { 67 static const uint32_t RasterTileColorOffsets[16] 68 { 0, 69 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), 70 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, 71 
(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, 72 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, 73 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, 74 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, 75 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, 76 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8, 77 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9, 78 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10, 79 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11, 80 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12, 81 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13, 82 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14, 83 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15, 84 }; 85 assert(sampleNum < 16); 86 return RasterTileColorOffsets[sampleNum]; 87 } 88 89 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) 90 { 91 static const uint32_t RasterTileDepthOffsets[16] 92 { 0, 93 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), 94 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, 95 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, 96 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4, 97 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5, 98 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * 
FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6, 99 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7, 100 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8, 101 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9, 102 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10, 103 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11, 104 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12, 105 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13, 106 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14, 107 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15, 108 }; 109 assert(sampleNum < 16); 110 return RasterTileDepthOffsets[sampleNum]; 111 } 112 113 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) 114 { 115 static const uint32_t RasterTileStencilOffsets[16] 116 { 0, 117 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), 118 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2, 119 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3, 120 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4, 121 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5, 122 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6, 123 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7, 124 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8, 125 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * 
FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9, 126 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10, 127 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11, 128 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12, 129 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13, 130 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14, 131 (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15, 132 }; 133 assert(sampleNum < 16); 134 return RasterTileStencilOffsets[sampleNum]; 135 } 136 137 template<typename T, uint32_t InputCoverage> 138 struct generateInputCoverage 139 { 140 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) 141 { 142 // will need to update for avx512 143 assert(KNOB_SIMD_WIDTH == 8); 144 145 __m256i mask[2]; 146 __m256i sampleCoverage[2]; 147 if(T::bIsStandardPattern) 148 { 149 __m256i src = _mm256_set1_epi32(0); 150 __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; 151 152 if(T::MultisampleT::numSamples == 1) 153 { 154 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); 155 } 156 else if(T::MultisampleT::numSamples == 2) 157 { 158 mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); 159 } 160 else if(T::MultisampleT::numSamples == 4) 161 { 162 mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); 163 } 164 else if(T::MultisampleT::numSamples == 8) 165 { 166 mask[0] = _mm256_set1_epi32(-1); 167 } 168 else if(T::MultisampleT::numSamples == 16) 169 { 170 mask[0] = _mm256_set1_epi32(-1); 171 mask[1] = _mm256_set1_epi32(-1); 172 index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); 173 } 174 175 // gather coverage for samples 0-7 176 sampleCoverage[0] = 
_mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); 177 if(T::MultisampleT::numSamples > 8) 178 { 179 // gather coverage for samples 8-15 180 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); 181 } 182 } 183 else 184 { 185 // center coverage is the same for all samples; just broadcast to the sample slots 186 uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); 187 if(T::MultisampleT::numSamples == 1) 188 { 189 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); 190 } 191 else if(T::MultisampleT::numSamples == 2) 192 { 193 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); 194 } 195 else if(T::MultisampleT::numSamples == 4) 196 { 197 sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); 198 } 199 else if(T::MultisampleT::numSamples == 8) 200 { 201 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); 202 } 203 else if(T::MultisampleT::numSamples == 16) 204 { 205 sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); 206 sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); 207 } 208 } 209 210 mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, 211 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); 212 // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane 213 __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); 214 215 __m256i packedCoverage1; 216 if(T::MultisampleT::numSamples > 8) 217 { 218 // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane 219 packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); 220 } 221 222 #if (KNOB_ARCH == KNOB_ARCH_AVX) 223 // pack lower 32 bits 
of each 128 bit lane into lower 64 bits of single 128 bit lane 224 __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); 225 __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); 226 packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); 227 228 __m256i packedSampleCoverage; 229 if(T::MultisampleT::numSamples > 8) 230 { 231 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane 232 hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); 233 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); 234 shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); 235 packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); 236 packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); 237 } 238 else 239 { 240 packedSampleCoverage = packedCoverage0; 241 } 242 #else 243 __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); 244 // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 245 packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); 246 247 __m256i packedSampleCoverage; 248 if(T::MultisampleT::numSamples > 8) 249 { 250 permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); 251 // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane 252 packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); 253 254 // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane 255 packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); 256 } 257 else 258 { 259 
packedSampleCoverage = packedCoverage0; 260 } 261 #endif 262 263 for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) 264 { 265 // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 266 inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); 267 268 if(!T::bForcedSampleCount) 269 { 270 // input coverage has to be anded with sample mask if MSAA isn't forced on 271 inputMask[i] &= sampleMask; 272 } 273 274 // shift to the next pixel in the 4x2 275 packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); 276 } 277 } 278 279 INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) 280 { 281 uint32_t inputMask[KNOB_SIMD_WIDTH]; 282 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); 283 inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); 284 } 285 286 }; 287 288 template<typename T> 289 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> 290 { 291 INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) 292 { 293 // will need to update for avx512 294 assert(KNOB_SIMD_WIDTH == 8); 295 __m256i vec = _mm256_set1_epi32(coverageMask[0]); 296 const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); 297 vec = _simd_and_si(vec, bit); 298 vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec); 299 vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec); 300 inputCoverage = _simd_castsi_ps(vec); 301 } 302 303 INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) 304 { 305 uint32_t simdCoverage = (coverageMask[0] & MASK); 306 static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1; 307 for(int i = 0; i < 
KNOB_SIMD_WIDTH; i++) 308 { 309 // set all samples to covered if conservative coverage mask is set for that pixel 310 inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0; 311 } 312 } 313 }; 314 315 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 316 // Centroid behaves exactly as follows : 317 // (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 318 // have a sample location there). 319 // (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 320 // coverage with the SampleMask Rasterizer State. 321 // (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 322 // evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 323 // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point. 324 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 325 template<typename T> 326 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask, 327 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) 328 { 329 uint32_t inputMask[KNOB_SIMD_WIDTH]; 330 generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); 331 332 // Case (2) - partially covered pixel 333 334 // scan for first covered sample per pixel in the 4x2 span 335 unsigned long sampleNum[KNOB_SIMD_WIDTH]; 336 (inputMask[0] > 0) ? 
(_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0); 337 (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0); 338 (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0); 339 (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0); 340 (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0); 341 (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0); 342 (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0); 343 (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); 344 345 // look up and set the sample offsets from UL pixel corner for first covered sample 346 __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]), 347 T::MultisampleT::X(sampleNum[6]), 348 T::MultisampleT::X(sampleNum[5]), 349 T::MultisampleT::X(sampleNum[4]), 350 T::MultisampleT::X(sampleNum[3]), 351 T::MultisampleT::X(sampleNum[2]), 352 T::MultisampleT::X(sampleNum[1]), 353 T::MultisampleT::X(sampleNum[0])); 354 355 __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]), 356 T::MultisampleT::Y(sampleNum[6]), 357 T::MultisampleT::Y(sampleNum[5]), 358 T::MultisampleT::Y(sampleNum[4]), 359 T::MultisampleT::Y(sampleNum[3]), 360 T::MultisampleT::Y(sampleNum[2]), 361 T::MultisampleT::Y(sampleNum[1]), 362 T::MultisampleT::Y(sampleNum[0])); 363 // add sample offset to UL pixel corner 364 vXSample = _simd_add_ps(vXSamplePosUL, vXSample); 365 vYSample = _simd_add_ps(vYSamplePosUL, vYSample); 366 367 // Case (1) and case (3b) - All samples covered or not covered with full SampleMask 368 static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask(); 369 __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]); 370 __m256i 
vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask); 371 372 static const __m256i vZero = _simd_setzero_si(); 373 const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask); 374 __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero); 375 __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask); 376 __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask); 377 378 __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b); 379 380 // set the centroid position based on results from above 381 psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter)); 382 psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter)); 383 384 // Case (3a) No samples covered and partial sample mask 385 __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask); 386 // sample mask should never be all 0's for this case, but handle it anyways 387 unsigned long firstCoveredSampleMaskSample = 0; 388 (sampleMask > 0) ? 
(_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); 389 390 __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); 391 392 vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample)); 393 vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample)); 394 395 // blend in case 3a pixel locations 396 psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); 397 psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); 398 } 399 400 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, 401 const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) 402 { 403 // evaluate I,J 404 psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); 405 psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); 406 psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); 407 psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); 408 409 // interpolate 1/w 410 psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); 411 } 412 413 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz) 414 { 415 const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz)); 416 const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz)); 417 418 return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask)); 419 } 420 421 template<typename T> 422 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount) 423 { 424 // RT has to be single sample if we're in forcedMSAA mode 425 if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X)) 426 { 427 return 1; 428 } 429 // unless 
we're forced to single sample, in which case we run the OM at the sample count of the RT 430 else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X)) 431 { 432 return GetNumSamples(blendSampleCount); 433 } 434 // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count 435 else 436 { 437 return T::MultisampleT::numSamples; 438 } 439 } 440 441 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work) 442 { 443 // broadcast scalars 444 445 coeffs->vIa = _simd_broadcast_ss(&work.I[0]); 446 coeffs->vIb = _simd_broadcast_ss(&work.I[1]); 447 coeffs->vIc = _simd_broadcast_ss(&work.I[2]); 448 449 coeffs->vJa = _simd_broadcast_ss(&work.J[0]); 450 coeffs->vJb = _simd_broadcast_ss(&work.J[1]); 451 coeffs->vJc = _simd_broadcast_ss(&work.J[2]); 452 453 coeffs->vZa = _simd_broadcast_ss(&work.Z[0]); 454 coeffs->vZb = _simd_broadcast_ss(&work.Z[1]); 455 coeffs->vZc = _simd_broadcast_ss(&work.Z[2]); 456 457 coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet); 458 459 coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); 460 coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); 461 coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); 462 } 463 464 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers) 465 { 466 assert(colorBufferCount <= SWR_NUM_RENDERTARGETS); 467 468 if (pColorBuffer) 469 { 470 for (uint32_t index = 0; index < colorBufferCount; index += 1) 471 { 472 pColorBuffer[index] = renderBuffers.pColor[index]; 473 } 474 } 475 476 if (pDepthBuffer) 477 { 478 *pDepthBuffer = renderBuffers.pDepth; 479 } 480 481 if (pStencilBuffer) 482 { 483 *pStencilBuffer = renderBuffers.pStencil;; 484 } 485 } 486 487 template<typename T> 488 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC &work) 489 { 490 
psContext->pAttribs = work.pAttribs; 491 psContext->pPerspAttribs = work.pPerspAttribs; 492 psContext->frontFace = work.triFlags.frontFacing; 493 psContext->primID = work.triFlags.primID; 494 495 // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs 496 psContext->I = work.I; 497 psContext->J = work.J; 498 499 psContext->recipDet = work.recipDet; 500 psContext->pRecipW = work.pRecipW; 501 psContext->pSamplePosX = reinterpret_cast<const float *>(&T::MultisampleT::samplePosX); 502 psContext->pSamplePosY = reinterpret_cast<const float *>(&T::MultisampleT::samplePosY); 503 psContext->rasterizerSampleCount = T::MultisampleT::numSamples; 504 psContext->sampleIndex = 0; 505 } 506 507 template<typename T, bool IsSingleSample> 508 void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask) 509 { 510 if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different 511 { 512 // for 1x case, centroid is pixel center 513 psContext->vX.centroid = psContext->vX.center; 514 psContext->vY.centroid = psContext->vY.center; 515 psContext->vI.centroid = psContext->vI.center; 516 psContext->vJ.centroid = psContext->vJ.center; 517 psContext->vOneOverW.centroid = psContext->vOneOverW.center; 518 } 519 else 520 { 521 if (T::bCentroidPos) 522 { 523 ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid 524 if (T::bIsStandardPattern) 525 { 526 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'.. 
527 CalcCentroidPos<T>(*psContext, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL); 528 } 529 else 530 { 531 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); 532 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f)); 533 } 534 535 CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL); 536 } 537 else 538 { 539 psContext->vX.centroid = psContext->vX.sample; 540 psContext->vY.centroid = psContext->vY.sample; 541 } 542 } 543 } 544 545 template<typename T> 546 struct PixelRateZTestLoop 547 { 548 PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, 549 uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) : 550 pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), 551 clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {}; 552 553 INLINE 554 uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 555 const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0) 556 { 557 SWR_CONTEXT *pContext = pDC->pContext; 558 559 uint32_t statCount = 0; 560 simdscalar anyDepthSamplePassed = _simd_setzero_ps(); 561 for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++) 562 { 563 const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample]; 564 vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK)); 565 566 if(!_simd_movemask_ps(vCoverageMask[sample])) 567 { 568 vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps(); 569 continue; 570 } 571 572 // offset depth/stencil buffers current sample 573 uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); 574 uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); 575 576 if 
(state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) 577 { 578 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); 579 580 const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); 581 582 const float minz = state.depthBoundsState.depthBoundsTestMinValue; 583 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; 584 585 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz))); 586 } 587 588 AR_BEGIN(BEBarycentric, pDC->drawId); 589 590 // calculate per sample positions 591 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample)); 592 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample)); 593 594 // calc I & J per sample 595 CalcSampleBarycentrics(coeffs, psContext); 596 597 if(psState.writesODepth) 598 { 599 // broadcast and test oDepth(psContext.vZ) written from the PS for each sample 600 vZ[sample] = psContext.vZ; 601 } 602 else 603 { 604 vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); 605 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); 606 } 607 608 AR_END(BEBarycentric, 0); 609 610 ///@todo: perspective correct vs non-perspective correct clipping? 611 // if clip distances are enabled, we need to interpolate for each sample 612 if(clipDistanceMask) 613 { 614 uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); 615 616 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask)); 617 } 618 619 // ZTest for this sample 620 ///@todo Need to uncomment out this bucket. 
// NOTE(review): the statements down to the closing "};" are the tail of a definition that begins
// earlier in this file; they are reproduced unchanged.
            //AR_BEGIN(BEDepthBucket, pDC->drawId);
            depthPassMask[sample] = vCoverageMask[sample];
            stencilPassMask[sample] = vCoverageMask[sample];
            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                     vZ[sample], pDepthSample, vCoverageMask[sample],
                                                     pStencilSample, &stencilPassMask[sample]);
            //AR_END(BEDepthBucket, 0);

            // early-exit if no pixels passed depth or earlyZ is forced on
            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
            {
                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);

                if(!_simd_movemask_ps(depthPassMask[sample]))
                {
                    continue;
                }
            }
            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
            statCount += _mm_popcnt_u32(statMask);
        }

        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
        // return number of samples that passed depth and coverage
        return statCount;
    }

    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
    simdscalar vZ[T::MultisampleT::numCoverageSamples];
    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];

private:
    // functor inputs
    DRAW_CONTEXT* pDC;
    uint32_t workerId;

    const SWR_TRIANGLE_DESC& work;
    const BarycentricCoeffs& coeffs;
    const API_STATE& state;
    const SWR_PS_STATE& psState;
    const uint8_t clipDistanceMask;
    uint8_t*& pDepthBuffer;
    uint8_t*& pStencilBuffer;
};

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the barycentric I and J values and the interpolated 1/w
///        at the pixel-center positions stored in the pixel shader context.
/// @param coeffs - barycentric plane coefficients (vIa/vIb/vIc, vJa/vJb/vJc),
///                 the reciprocal determinant (vRecipDet) and the 1/w
///                 coefficients (vAOneOverW/vBOneOverW/vCOneOverW).
/// @param psContext - reads vX.center and vY.center; writes vI.center,
///                    vJ.center and vOneOverW.center.
INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
{
    // evaluate I,J
    // (vplaneps is defined elsewhere; presumably it evaluates a*x + b*y + c per SIMD lane -- TODO confirm)
    psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
    psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
    // scale both results by the reciprocal determinant
    psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
    psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);

    // interpolate 1/w
    // (uses the I,J values just written, so this must stay after the two multiplies above)
    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Per-sample counterpart of CalcPixelBarycentrics: computes the
///        barycentric I and J values and the interpolated 1/w at the sample
///        positions stored in the pixel shader context.
/// @param coeffs - barycentric plane coefficients, reciprocal determinant
///                 and 1/w coefficients (same fields as used by
///                 CalcPixelBarycentrics).
/// @param psContext - reads vX.sample and vY.sample; writes vI.sample,
///                    vJ.sample and vOneOverW.sample.
INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
{
    // evaluate I,J
    psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
    psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
    // scale both results by the reciprocal determinant
    psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
    psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);

    // interpolate 1/w
    // (uses the I,J values just written, so this must stay after the two multiplies above)
    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Merge Output to 4x2 SIMD Tile Format. For each of the first NumRT
///        render targets: optionally runs the blend function, then stores
///        the resulting color channels with a per-lane write mask.
/// @param psContext - pixel shader context; shaded[] supplies the colors,
///                    oMask is handed to the blend function by address.
/// @param pColorBase - per-render-target base pointers of the color data.
/// @param sample - sample index; selects the byte offset returned by
///                 RasterTileColorOffset and is forwarded to the blend function.
/// @param pBlendState - blend state; renderTarget[rt] supplies the
///                      per-channel write-disable flags.
/// @param pfnBlendFunc - per-render-target blend functions; a nullptr entry
///                       skips blending for that render target.
/// @param coverageMask - coverage mask, passed by reference; its address is
///                       given to the blend function, which may modify it.
/// @param depthPassMask - lanes that passed the depth test; ANDed with
///                        coverageMask to form the final write mask.
/// @param NumRT - number of render targets to process.
INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
                            const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
{
    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
    const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
    simdvector blendOut;

    for(uint32_t rt = 0; rt < NumRT; ++rt)
    {
        // byte address of this sample's color data for the current render target
        uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;

        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
        // pfnBlendFunc may not update all channels. Initialize with PS output.
        /// TODO: move this into the blend JIT.
        blendOut = psContext.shaded[rt];

        // Blend outputs and update coverage mask for alpha test
        if(pfnBlendFunc[rt] != nullptr)
        {
            // NOTE(review): shaded[1] and shaded[0].w are passed for every render target,
            // independent of rt -- presumably the second color source and the alpha-test
            // source; confirm against the PFN_BLEND_JIT_FUNC contract.
            pfnBlendFunc[rt](
                pBlendState,
                psContext.shaded[rt],
                psContext.shaded[1],
                psContext.shaded[0].w,
                sample,
                pColorSample,
                blendOut,
                &psContext.oMask,
                (simdscalari*)&coverageMask);
        }

        // final write mask
        // (computed after the blend call because that call receives coverageMask by address)
        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));

        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");

        // byte distance between consecutive channels: one SIMD register of floats
        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);

        // store with color mask
        // channels are written at byte offsets 0, simd, 2*simd and 3*simd from pColorSample
        if(!pRTBlend->writeDisableRed)
        {
            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
        }
        if(!pRTBlend->writeDisableGreen)
        {
            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
        }
        if(!pRTBlend->writeDisableBlue)
        {
            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
        }
        if(!pRTBlend->writeDisableAlpha)
        {
            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
        }
    }
}

#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief Merge Output to 8x2 SIMD16 Tile Format. Same overall flow as
///        OutputMerger4x2, but the destination channels are two simdscalar
///        slots apart and the blend function reads the destination color
///        from a local copy (blendSrc) instead of the color buffer itself.
/// @param psContext - pixel shader context; shaded[] supplies the colors,
///                    oMask is handed to the blend function by address.
/// @param pColorBase - per-render-target base pointers of the color data.
/// @param sample - sample index; selects the byte offset returned by
///                 RasterTileColorOffset and is forwarded to the blend function.
/// @param pBlendState - blend state; renderTarget[rt] supplies the
///                      per-channel write-disable flags.
/// @param pfnBlendFunc - per-render-target blend functions; a nullptr entry
///                       skips blending for that render target.
/// @param coverageMask - coverage mask, passed by reference; its address is
///                       given to the blend function, which may modify it.
/// @param depthPassMask - lanes that passed the depth test; ANDed with
///                        coverageMask to form the final write mask.
/// @param NumRT - number of render targets to process.
/// @param colorBufferEnableMask - bit rt set means the current destination
///                                color of render target rt is loaded into
///                                blendSrc before blending.
/// @param useAlternateOffset - when true, the sample offset is advanced by
///                             sizeof(simdscalar) bytes.
INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
                            const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
{
    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
    uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);

    if (useAlternateOffset)
    {
        // presumably selects the second SIMD8 half of the 8x2 tile -- TODO confirm
        rasterTileColorOffset += sizeof(simdscalar);
    }

    simdvector blendSrc;
    simdvector blendOut;

    // colorBufferBit tracks bit 'rt' of colorBufferEnableMask as the loop advances
    uint32_t colorBufferBit = 1;
    for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
    {
        simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);

        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
        // pfnBlendFunc may not update all channels. Initialize with PS output.
        /// TODO: move this into the blend JIT.
        blendOut = psContext.shaded[rt];

        // load the destination color; channels sit at simdscalar slots 0, 2, 4 and 6
        if (colorBufferBit & colorBufferEnableMask)
        {
            blendSrc[0] = pColorSample[0];
            blendSrc[1] = pColorSample[2];
            blendSrc[2] = pColorSample[4];
            blendSrc[3] = pColorSample[6];
        }

        // Blend outputs and update coverage mask for alpha test
        // NOTE(review): when this render target's bit is clear in colorBufferEnableMask,
        // blendSrc still holds whatever it held before this iteration (nothing is assigned
        // to it on the first iteration) and is passed to the blend function anyway --
        // confirm this is intentional.
        if (pfnBlendFunc[rt] != nullptr)
        {
            pfnBlendFunc[rt](
                pBlendState,
                psContext.shaded[rt],
                psContext.shaded[1],
                psContext.shaded[0].w,
                sample,
                reinterpret_cast<uint8_t *>(&blendSrc),
                blendOut,
                &psContext.oMask,
                reinterpret_cast<simdscalari *>(&coverageMask));
        }

        // final write mask
        // (computed after the blend call because that call receives coverageMask by address)
        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));

        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");

        // store with color mask
        // same slot layout as the loads above: x, y, z, w at slots 0, 2, 4, 6
        if (!pRTBlend->writeDisableRed)
        {
            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
        }
        if (!pRTBlend->writeDisableGreen)
        {
            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
        }
        if (!pRTBlend->writeDisableBlue)
        {
            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
        }
        if (!pRTBlend->writeDisableAlpha)
        {
            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
        }
    }
}

#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Compile-time description of a backend configuration. Converts the
///        integer template arguments into named constants and selects the
///        matching MultisampleTraits specialization.
/// @tparam sampleCountT - sample count, cast to SWR_MULTISAMPLE_COUNT for
///                        MultisampleTraits (default SWR_MULTISAMPLE_1X).
/// @tparam samplePattern - sample pattern; any value other than
///                         SWR_MSAA_STANDARD_PATTERN selects
///                         SWR_MSAA_CENTER_PATTERN.
/// @tparam coverage - exposed unchanged as InputCoverage.
/// @tparam centroid - 1 sets bCentroidPos.
/// @tparam forced - 1 sets bForcedSampleCount.
/// @tparam canEarlyZ - 1 sets bCanEarlyZ.
template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
struct SwrBackendTraits
{
    static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
    static const uint32_t InputCoverage = coverage;
    // the three flags below are true only when the corresponding argument is exactly 1
    static const bool bCentroidPos = (centroid == 1);
    static const bool bForcedSampleCount = (forced == 1);
    static const bool bCanEarlyZ = (canEarlyZ == 1);
    typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
};