1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file rasterizer.cpp 24 * 25 * @brief Implementation for the rasterizer. 
*
******************************************************************************/

#include <vector>
#include <algorithm>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"

// Table of rasterizer work functions (declaration only; the definition is not in this section).
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];

// Forward declarations; the definitions are not in this section of the file.
template <uint32_t numSamples = 1>
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
template <typename RT>
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
template <typename RT>
void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);

// Expands a 4-bit mask (i3 = bit 3 ... i0 = bit 0) into a 4-lane initializer.
// Lane n is -1 when bit n is set and 0 otherwise, so only set bits yield a negative lane.
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}

// Lookup table indexed by a 4-bit mask (0..15). adjustTopLeftRuleIntFix16 passes an entry
// as the selector of _mm256_blendv_pd, which picks per lane based on the lane's sign bit.
static const __m256d gMaskToVecpd[] =
{
    MASKTOVEC(0, 0, 0, 0),
    MASKTOVEC(0, 0, 0, 1),
    MASKTOVEC(0, 0, 1, 0),
    MASKTOVEC(0, 0, 1, 1),
    MASKTOVEC(0, 1, 0, 0),
    MASKTOVEC(0, 1, 0, 1),
    MASKTOVEC(0, 1, 1, 0),
    MASKTOVEC(0, 1, 1, 1),
    MASKTOVEC(1, 0, 0, 0),
    MASKTOVEC(1, 0, 0, 1),
    MASKTOVEC(1, 0, 1, 0),
    MASKTOVEC(1, 0, 1, 1),
    MASKTOVEC(1, 1, 0, 0),
    MASKTOVEC(1, 1, 0, 1),
    MASKTOVEC(1, 1, 1, 0),
    MASKTOVEC(1, 1, 1, 1),
};

// Integer 2D position; used below to build edges from scissor rectangle corners.
struct POS
{
    int32_t x, y;
};

// Per-edge constants filled in by ComputeEdgeData and consumed while stepping
// edge equations across quads and raster tiles.
struct EDGE
{
    double a, b;                // a, b edge coefficients in fix8
    double stepQuadX;           // step to adjacent horizontal quad in fix16
    double stepQuadY;           // step to adjacent vertical quad in fix16
    double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
    double stepRasterTileY;     // step to adjacent vertical raster tile in fix16

    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
};

//////////////////////////////////////////////////////////////////////////
/// @brief rasterize a raster tile partially covered by the triangle
/// @tparam NumEdges - number of edge equations tested per sample
/// @tparam EdgeMaskT - valid-edge mask type; its ::value is forwarded to UnrollerLMask
///        in the 8x8 path. For E1E2ValidT and NoEdgesValidT the per-quad mask is seeded
///        with 0xf instead of edge 0's mask.
/// @param pDC - draw context (not referenced by this function)
/// @param startEdges - one evaluated edge value per edge; vQuadOffsets is added to it
///        to obtain the 4 sample values of the first 2x2 quad
/// @param pRastEdges - per-edge data; vQuadOffsets, stepQuadX and stepQuadY are read
/// @return coverage mask holding 4 bits per 2x2 quad, quads stored in row-major order
///         (a sample is covered when every tested edge value has its sign bit set)
template<uint32_t NumEdges, typename EdgeMaskT>
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
{
    uint64_t coverageMask = 0;

    __m256d vEdges[NumEdges];
    __m256d vStepX[NumEdges];
    __m256d vStepY[NumEdges];

    for (uint32_t e = 0; e < NumEdges; ++e)
    {
        // Step to the pixel sample locations of the 1st quad
        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);

        // compute step to next quad (mul by 2 in x and y direction)
        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
    }

    // fast unrolled version for 8x8 tile
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
    int edgeMask[NumEdges];
    uint64_t mask;

    // per-edge operations handed to UnrollerLMask by the macros below
    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
    auto update_lambda = [&](int e){mask &= edgeMask[e];};
    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};

    // NOTE: the helper macros below are not #undef'd in this function.

    // evaluate which pixels in the quad are covered
#define EVAL \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);

    // update coverage mask
    // if edge 0 is degenerate and will be skipped; init the mask
#define UPDATE_MASK(bit) \
            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
                mask = 0xf;\
            }\
            else{\
                mask = edgeMask[0]; \
            }\
            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
            coverageMask |= (mask << bit);

    // step in the +x direction to the next quad
#define INCX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);

    // step in the +y direction to the next quad
#define INCY \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);

    // step in the -x direction to the next quad
#define DECX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);

    // sweep 2x2 quad back and forth through the raster tile,
    // computing coverage masks for the entire tile

    // raster tile
    // 0  1  2  3  4  5  6  7
    // x  x
    // x  x ------------------>
    //                   x  x  |
    // <-----------------x  x  V
    // ..

    // The bit offset passed to UPDATE_MASK is (quadRow * 4 + quadColumn) * 4, so odd
    // rows, which are walked right-to-left, still land in row-major bit positions.

    // row 0
    EVAL;
    UPDATE_MASK(0);
    INCX;
    EVAL;
    UPDATE_MASK(4);
    INCX;
    EVAL;
    UPDATE_MASK(8);
    INCX;
    EVAL;
    UPDATE_MASK(12);
    INCY;

    //row 1
    EVAL;
    UPDATE_MASK(28);
    DECX;
    EVAL;
    UPDATE_MASK(24);
    DECX;
    EVAL;
    UPDATE_MASK(20);
    DECX;
    EVAL;
    UPDATE_MASK(16);
    INCY;

    // row 2
    EVAL;
    UPDATE_MASK(32);
    INCX;
    EVAL;
    UPDATE_MASK(36);
    INCX;
    EVAL;
    UPDATE_MASK(40);
    INCX;
    EVAL;
    UPDATE_MASK(44);
    INCY;

    // row 3
    EVAL;
    UPDATE_MASK(60);
    DECX;
    EVAL;
    UPDATE_MASK(56);
    DECX;
    EVAL;
    UPDATE_MASK(52);
    DECX;
    EVAL;
    UPDATE_MASK(48);
#else
    // generic path for other tile sizes: walk quads row by row, left to right
    // NOTE(review): unlike the 8x8 path, this path tests every edge, including
    // edge 0, regardless of EdgeMaskT - confirm this is intended for degenerate edges.
    uint32_t bit = 0;
    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
    {
        // remember the row start so the y step is applied from the left column
        __m256d vStartOfRowEdge[NumEdges];
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vStartOfRowEdge[e] = vEdges[e];
        }

        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
        {
            int edgeMask[NumEdges];
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
            }

            uint64_t mask = edgeMask[0];
            for (uint32_t e = 1; e < NumEdges; ++e)
            {
                mask &= edgeMask[e];
            }
            coverageMask |= (mask << bit);

            // step to the next quad in the x direction
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
            }
            bit+=4;
        }

        // step to the next row
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
        }
    }
#endif
    return coverageMask;

}
// Top left rule:
// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
// Top left: a sample is in if it is a top or left edge.
// Out: !(horizontal && above) = !horizontal && below
// Out: !horizontal && left = !(!horizontal && left) = horizontal and right
//
// Applies the rule lane-wise: lane n of vEdge is adjusted using lane n of vA and vB.
// vA, vB - packed 32-bit A and B edge coefficients
// vEdge  - evaluated edge values, updated in place
INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
{
    // if vA < 0, vC--
    // if vA == 0 && vB < 0, vC--

    __m256d vEdgeOut = vEdge;
    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));

    // if vA < 0 (line is not horizontal and below)
    // (movemask gathers the sign bit of each 32-bit lane)
    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));

    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));

    // if either of these are true, take the edge value minus 1 for that lane.
    // NOTE(review): the subtraction is unconditional for selected lanes; presumably edge
    // values are integral, so only samples exactly on the line change sign - confirm.
    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
}

//////////////////////////////////////////////////////////////////////////
/// @brief calculates difference
in precision between the result of manh 287 /// calculation and the edge precision, based on compile time trait values 288 template<typename RT> 289 constexpr int64_t ManhToEdgePrecisionAdjust() 290 { 291 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, 292 "Inadequate precision of result of manh calculation "); 293 return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value); 294 } 295 296 ////////////////////////////////////////////////////////////////////////// 297 /// @struct adjustEdgeConservative 298 /// @brief Primary template definition used for partially specializing 299 /// the adjustEdgeConservative function. This struct should never 300 /// be instantiated. 301 /// @tparam RT: rasterizer traits 302 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting? 303 template <typename RT, typename ConservativeEdgeOffsetT> 304 struct adjustEdgeConservative 305 { 306 ////////////////////////////////////////////////////////////////////////// 307 /// @brief Performs calculations to adjust each edge of a triangle away 308 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y 309 /// direction. 310 /// 311 /// Uncertainty regions arise from fixed point rounding, which 312 /// can snap a vertex +/- by min fixed point value. 313 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. 314 /// This allows the rasterizer to test for coverage only at the pixel center, 315 /// instead of having to test individual pixel corners for conservative coverage 316 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) 317 { 318 // Assumes CCW winding order. 
Subtracting from the evaluated edge equation moves the edge away 319 // from the pixel center (in the direction of the edge normal A/B) 320 321 // edge = Ax + Bx + C - (manh/e) 322 // manh = manhattan distance = abs(A) + abs(B) 323 // e = absolute rounding error from snapping from float to fixed point precision 324 325 // 'fixed point' multiply (in double to be avx1 friendly) 326 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example 327 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); 328 __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), 329 _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); 330 331 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, 332 "Inadequate precision of result of manh calculation "); 333 334 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision 335 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right 336 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5)); 337 338 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel 339 // this allows the rasterizer to do a single conservative coverage test to see if the primitive 340 // intersects the pixel at all 341 vEdge = _mm256_sub_pd(vEdge, manh); 342 }; 343 }; 344 345 ////////////////////////////////////////////////////////////////////////// 346 /// @brief adjustEdgeConservative specialization where no edge offset is needed 347 template <typename RT> 348 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>> 349 { 350 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {}; 351 }; 352 353 
////////////////////////////////////////////////////////////////////////// 354 /// @brief calculates the distance a degenerate BBox needs to be adjusted 355 /// for conservative rast based on compile time trait values 356 template<typename RT> 357 constexpr int64_t ConservativeScissorOffset() 358 { 359 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision"); 360 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges 361 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT; 362 // 1/2 pixel edge offset + conservative offset - degenerateTriangle 363 return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); 364 } 365 366 ////////////////////////////////////////////////////////////////////////// 367 /// @brief Performs calculations to adjust each a vector of evaluated edges out 368 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y 369 /// direction. 370 template <typename RT> 371 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge) 372 { 373 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); 374 int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>(); 375 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); 376 }; 377 378 ////////////////////////////////////////////////////////////////////////// 379 /// @brief Performs calculations to adjust each a scalar evaluated edge out 380 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y 381 /// direction. 
382 template <typename RT, typename OffsetT> 383 INLINE double adjustScalarEdge(const double a, const double b, const double Edge) 384 { 385 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); 386 int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>(); 387 return (Edge - manh); 388 }; 389 390 ////////////////////////////////////////////////////////////////////////// 391 /// @brief Perform any needed adjustments to evaluated triangle edges 392 template <typename RT, typename EdgeOffsetT> 393 struct adjustEdgesFix16 394 { 395 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) 396 { 397 static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, 398 "Edge equation expected to be in x.16 fixed point"); 399 400 static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled"); 401 402 // need to apply any edge offsets before applying the top-left rule 403 adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge); 404 405 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); 406 } 407 }; 408 409 ////////////////////////////////////////////////////////////////////////// 410 /// @brief Perform top left adjustments to evaluated triangle edges 411 template <typename RT> 412 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>> 413 { 414 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) 415 { 416 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); 417 } 418 }; 419 420 // max(abs(dz/dx), abs(dz,dy) 421 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) 422 { 423 /* 424 // evaluate i,j at (0,0) 425 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; 426 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; 427 428 // evaluate i,j at (1,0) 429 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; 430 float j10 = 
pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; 431 432 // compute dz/dx 433 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; 434 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; 435 float dzdx = abs(d10 - d00); 436 437 // evaluate i,j at (0,1) 438 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; 439 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; 440 441 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; 442 float dzdy = abs(d01 - d00); 443 */ 444 445 // optimized version of above 446 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); 447 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); 448 449 return std::max(dzdx, dzdy); 450 } 451 452 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) 453 { 454 if (pState->depthFormat == R24_UNORM_X8_TYPELESS) 455 { 456 return (1.0f / (1 << 24)); 457 } 458 else if (pState->depthFormat == R16_UNORM) 459 { 460 return (1.0f / (1 << 16)); 461 } 462 else 463 { 464 SWR_ASSERT(pState->depthFormat == R32_FLOAT); 465 466 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) 467 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); 468 uint32_t zMaxInt = *(uint32_t*)&zMax; 469 zMaxInt &= 0x7f800000; 470 zMax = *(float*)&zMaxInt; 471 472 return zMax * (1.0f / (1 << 23)); 473 } 474 } 475 476 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) 477 { 478 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) 479 { 480 return 0.0f; 481 } 482 483 float scale = pState->slopeScaledDepthBias; 484 if (scale != 0.0f) 485 { 486 scale *= ComputeMaxDepthSlope(pTri); 487 } 488 489 float bias = pState->depthBias; 490 if (!pState->depthBiasPreAdjusted) 491 { 492 bias *= ComputeBiasFactor(pState, pTri, z); 493 } 494 bias += scale; 495 496 if 
(pState->depthBiasClamp > 0.0f) 497 { 498 bias = std::min(bias, pState->depthBiasClamp); 499 } 500 else if (pState->depthBiasClamp < 0.0f) 501 { 502 bias = std::max(bias, pState->depthBiasClamp); 503 } 504 505 return bias; 506 } 507 508 // Prevent DCE by writing coverage mask from rasterizer to volatile 509 #if KNOB_ENABLE_TOSS_POINTS 510 __declspec(thread) volatile uint64_t gToss; 511 #endif 512 513 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; 514 // try to avoid _chkstk insertions; make this thread local 515 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib]; 516 517 INLINE 518 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) 519 { 520 edge.a = a; 521 edge.b = b; 522 523 // compute constant steps to adjacent quads 524 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); 525 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); 526 527 // compute constant steps to adjacent raster tiles 528 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); 529 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); 530 531 // compute quad offsets 532 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); 533 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); 534 535 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); 536 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); 537 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); 538 539 // compute raster tile offsets 540 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0); 541 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 
(KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0); 542 543 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); 544 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); 545 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); 546 } 547 548 INLINE 549 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) 550 { 551 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); 552 } 553 554 ////////////////////////////////////////////////////////////////////////// 555 /// @brief Primary template definition used for partially specializing 556 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel 557 /// corner to sample position, and test for coverage 558 /// @tparam sampleCount: multisample count 559 template <typename NumSamplesT> 560 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, 561 int32_t &mask0, int32_t &mask1, int32_t &mask2) 562 { 563 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; 564 // evaluate edge equations at the tile multisample bounding box 565 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); 566 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); 567 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); 568 mask0 = _mm256_movemask_pd(vSampleBboxTest0); 569 mask1 = _mm256_movemask_pd(vSampleBboxTest1); 570 mask2 = _mm256_movemask_pd(vSampleBboxTest2); 571 } 572 573 ////////////////////////////////////////////////////////////////////////// 574 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated 575 /// when only rasterizing a single coverage test point 576 template <> 577 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16, 578 int32_t &mask0, int32_t &mask1, int32_t &mask2) 579 { 580 mask0 = _mm256_movemask_pd(vEdgeFix16[0]); 581 mask1 = _mm256_movemask_pd(vEdgeFix16[1]); 582 mask2 = 
_mm256_movemask_pd(vEdgeFix16[2]); 583 } 584 585 ////////////////////////////////////////////////////////////////////////// 586 /// @struct ComputeScissorEdges 587 /// @brief Primary template definition. Allows the function to be generically 588 /// called. When paired with below specializations, will result in an empty 589 /// inlined function if scissor is not enabled 590 /// @tparam RasterScissorEdgesT: is scissor enabled? 591 /// @tparam IsConservativeT: is conservative rast enabled? 592 /// @tparam RT: rasterizer traits 593 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT> 594 struct ComputeScissorEdges 595 { 596 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 597 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){}; 598 }; 599 600 ////////////////////////////////////////////////////////////////////////// 601 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial 602 /// specialization. Instantiated when conservative rast and scissor are enabled 603 template <typename RT> 604 struct ComputeScissorEdges<std::true_type, std::true_type, RT> 605 { 606 ////////////////////////////////////////////////////////////////////////// 607 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, 608 /// evaluate edge equations and offset them away from pixel center. 
609 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 610 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) 611 { 612 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used 613 SWR_RECT scissor; 614 scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin); 615 scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax); 616 scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin); 617 scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax); 618 619 POS topLeft{scissor.xmin, scissor.ymin}; 620 POS bottomLeft{scissor.xmin, scissor.ymax}; 621 POS topRight{scissor.xmax, scissor.ymin}; 622 POS bottomRight{scissor.xmax, scissor.ymax}; 623 624 // construct 4 scissor edges in ccw direction 625 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); 626 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); 627 ComputeEdgeData(bottomRight, topRight, rastEdges[5]); 628 ComputeEdgeData(topRight, topLeft, rastEdges[6]); 629 630 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); 631 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); 632 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); 633 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); 634 635 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing 636 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); 637 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); 638 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]); 639 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]); 640 641 // Upper left rule for scissor 642 vEdgeFix16[3] = 
_mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); 643 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); 644 } 645 }; 646 647 ////////////////////////////////////////////////////////////////////////// 648 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial 649 /// specialization. Instantiated when scissor is enabled and conservative rast 650 /// is disabled. 651 template <typename RT> 652 struct ComputeScissorEdges<std::true_type, std::false_type, RT> 653 { 654 ////////////////////////////////////////////////////////////////////////// 655 /// @brief Compute scissor edge vectors and evaluate edge equations 656 INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 657 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) 658 { 659 const SWR_RECT &scissor = scissorBBox; 660 POS topLeft{scissor.xmin, scissor.ymin}; 661 POS bottomLeft{scissor.xmin, scissor.ymax}; 662 POS topRight{scissor.xmax, scissor.ymin}; 663 POS bottomRight{scissor.xmax, scissor.ymax}; 664 665 // construct 4 scissor edges in ccw direction 666 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); 667 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); 668 ComputeEdgeData(bottomRight, topRight, rastEdges[5]); 669 ComputeEdgeData(topRight, topLeft, rastEdges[6]); 670 671 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); 672 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); 673 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); 674 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); 675 676 // Upper left rule for scissor 677 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); 678 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], 
_mm256_set1_pd(1.0)); 679 } 680 }; 681 682 ////////////////////////////////////////////////////////////////////////// 683 /// @brief Primary function template for TrivialRejectTest. Should 684 /// never be called, but TemplateUnroller instantiates a few unused values, 685 /// so it calls a runtime assert instead of a static_assert. 686 template <typename ValidEdgeMaskT> 687 INLINE bool TrivialRejectTest(const int, const int, const int) 688 { 689 SWR_INVALID("Primary templated function should never be called"); 690 return false; 691 }; 692 693 ////////////////////////////////////////////////////////////////////////// 694 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0 695 /// and edge 1 for trivial coverage reject 696 template <> 697 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int) 698 { 699 return (!(mask0 && mask1)) ? true : false; 700 }; 701 702 ////////////////////////////////////////////////////////////////////////// 703 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0 704 /// and edge 2 for trivial coverage reject 705 template <> 706 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2) 707 { 708 return (!(mask0 && mask2)) ? true : false; 709 }; 710 711 ////////////////////////////////////////////////////////////////////////// 712 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1 713 /// and edge 2 for trivial coverage reject 714 template <> 715 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2) 716 { 717 return (!(mask1 && mask2)) ? true : false; 718 }; 719 720 ////////////////////////////////////////////////////////////////////////// 721 /// @brief AllEdgesValidT specialization of TrivialRejectTest. 
Tests all 722 /// primitive edges for trivial coverage reject 723 template <> 724 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2) 725 { 726 return (!(mask0 && mask1 && mask2)) ? true : false;; 727 }; 728 729 ////////////////////////////////////////////////////////////////////////// 730 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate 731 /// point, so return false and rasterize against conservative BBox 732 template <> 733 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int) 734 { 735 return false; 736 }; 737 738 ////////////////////////////////////////////////////////////////////////// 739 /// @brief Primary function template for TrivialAcceptTest. Always returns 740 /// false, since it will only be called for degenerate tris, and as such 741 /// will never cover the entire raster tile 742 template <typename ScissorEnableT> 743 INLINE bool TrivialAcceptTest(const int, const int, const int) 744 { 745 return false; 746 }; 747 748 ////////////////////////////////////////////////////////////////////////// 749 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all 750 /// edge masks for a fully covered raster tile 751 template <> 752 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2) 753 { 754 return ((mask0 & mask1 & mask2) == 0xf); 755 }; 756 757 ////////////////////////////////////////////////////////////////////////// 758 /// @brief Primary function template for GenerateSVInnerCoverage. 
Results 759 /// in an empty function call if SVInnerCoverage isn't requested 760 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> 761 struct GenerateSVInnerCoverage 762 { 763 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){}; 764 }; 765 766 ////////////////////////////////////////////////////////////////////////// 767 /// @brief Specialization of GenerateSVInnerCoverage where all edges 768 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 769 /// edge values from OuterConservative to InnerConservative and rasterizes. 770 template <typename RT> 771 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT> 772 { 773 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask) 774 { 775 SWR_CONTEXT *pContext = pDC->pContext; 776 777 double startQuadEdgesAdj[RT::NumEdgesT::value]; 778 for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e) 779 { 780 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); 781 } 782 783 // not trivial accept or reject, must rasterize full tile 784 AR_BEGIN(BERasterizePartial, pDC->drawId); 785 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges); 786 AR_END(BERasterizePartial, 0); 787 } 788 }; 789 790 ////////////////////////////////////////////////////////////////////////// 791 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. 
Results 792 /// in an empty function call if SVInnerCoverage isn't requested 793 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> 794 struct UpdateEdgeMasksInnerConservative 795 { 796 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*, 797 const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){}; 798 }; 799 800 ////////////////////////////////////////////////////////////////////////// 801 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges 802 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 803 /// evaluated at raster tile corners to inner conservative position and 804 /// updates edge masks 805 template <typename RT> 806 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT> 807 { 808 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, 809 const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2) 810 { 811 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; 812 813 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 814 // conservative evaluated edge when adjusting the edge in for inner conservative tests 815 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]); 816 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]); 817 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]); 818 819 UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); 820 } 821 }; 822 823 ////////////////////////////////////////////////////////////////////////// 824 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 825 /// is requested but at least one edge is degenerate. 
Since a degenerate triangle cannot
/// cover an entire raster tile, set mask0 to 0 to force it down the
/// rasterizePartialTile path
template <typename RT, typename ValidEdgeMaskT>
struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
{
    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
                                            const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
    {
        // set one mask to zero to force the triangle down the rasterizePartialTile path
        mask0 = 0;
    }
};

//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one triangle inside one macrotile. Performs triangle
/// setup (edge coefficients, barycentric/Z/1-over-W terms, perspective
/// attribute scaling), clips the triangle bbox against the scissor rect and
/// the macrotile, then walks every raster tile in the resulting range. Each
/// tile is trivially rejected, trivially accepted or partially rasterized
/// per coverage sample, and tiles with any covered sample are handed to the
/// pixel backend.
/// @tparam RT - rasterizer traits (sample count, valid edge mask, conservative
///         rast / scissor / input coverage selection, fixed point precision)
/// @param pDC - draw context
/// @param workerId - worker id, forwarded to the pixel backend
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices
/// @param pDesc - pointer to a TRIANGLE_WORK_DESC describing the triangle
template <typename RT>
void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
{
    // NOTE(review): pContext is not referenced directly below; presumably it is
    // consumed by the AR_BEGIN/AR_END macros - confirm before removing
    SWR_CONTEXT *pContext = pDC->pContext;
    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
#if KNOB_ENABLE_TOSS_POINTS
    // toss point: skip all rasterization work when enabled
    if (KNOB_TOSS_BIN_TRIS)
    {
        return;
    }
#endif
    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
    AR_BEGIN(BETriangleSetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);
    const SWR_RASTSTATE &rastState = state.rastState;
    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;

    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;

    __m128 vX, vY, vZ, vRecipW;

    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
    // eg: vX = [x0 x1 x2 dc]
    vX = _mm_load_ps(workDesc.pTriBuffer);
    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);

    // convert to fixed point
    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
    __m128i vXi = fpToFixedPoint(vX);
    __m128i vYi = fpToFixedPoint(vY);

    // quantize floating point position to fixed point precision
    // to prevent attribute creep around the triangle vertices
    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));

    // triangle setup - A and B edge equation coefs
    __m128 vA, vB;
    triangleSetupAB(vX, vY, vA, vB);

    // same coefficients computed from the fixed point positions
    __m128i vAi, vBi;
    triangleSetupABInt(vXi, vYi, vAi, vBi);

    // determinant
    float det = calcDeterminantInt(vAi, vBi);

    // Verts in Pixel Coordinate Space at this point
    // Det > 0 = CW winding order
    // Convert CW triangles to CCW
    if (det > 0.0)
    {
        vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
        vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
        det = -det;
    }

    __m128 vC;
    // Finish triangle setup - C edge coef
    triangleSetupC(vX, vY, vA, vB, vC);

    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we have degenerate edge(s) to rasterize, set I and J coefs
        // to 0 for constant interpolation of attributes
        triDesc.I[0] = 0.0f;
        triDesc.I[1] = 0.0f;
        triDesc.I[2] = 0.0f;
        triDesc.J[0] = 0.0f;
        triDesc.J[1] = 0.0f;
        triDesc.J[2] = 0.0f;

        // Degenerate triangles have no area
        triDesc.recipDet = 0.0f;
    }
    else
    {
        // only extract coefs for 2 of the barycentrics; the 3rd can be
        // determined from the barycentric equation:
        // i + j + k = 1 <=> k = 1 - j - i
        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);

        // compute recipDet, used to calculate barycentric i and j in the backend
        triDesc.recipDet = 1.0f/det;
    }

    // store 1/w as two deltas relative to vertex 2 plus the vertex 2 value
    OSALIGNSIMD(float) oneOverW[4];
    _mm_store_ps(oneOverW, vRecipW);
    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
    triDesc.OneOverW[2] = oneOverW[2];

    // calculate perspective correct coefs per vertex attrib
    float* pPerspAttribs = perspAttribsTLS;
    float* pAttribs = workDesc.pAttribs;
    triDesc.pPerspAttribs = pPerspAttribs;
    triDesc.pAttribs = pAttribs;
    float *pRecipW = workDesc.pTriBuffer + 12;
    triDesc.pRecipW = pRecipW;
    // broadcast 1/w of vertex 0, 1 and 2 (pRecipW is advanced in place between loads)
    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
    {
        // each attribute occupies 3 consecutive groups of 4 floats, one group per vertex
        __m128 attribA = _mm_load_ps(pAttribs);
        __m128 attribB = _mm_load_ps(pAttribs+=4);
        __m128 attribC = _mm_load_ps(pAttribs+=4);
        pAttribs+=4;

        attribA = _mm_mul_ps(attribA, vOneOverWV0);
        attribB = _mm_mul_ps(attribB, vOneOverWV1);
        attribC = _mm_mul_ps(attribC, vOneOverWV2);

        _mm_store_ps(pPerspAttribs, attribA);
        _mm_store_ps(pPerspAttribs+=4, attribB);
        _mm_store_ps(pPerspAttribs+=4, attribC);
        pPerspAttribs+=4;
    }

    // compute bary Z
    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
    // NOTE(review): the terms stored below are relative to vertex 2
    // (Z[0] = z0 - z2, Z[1] = z1 - z2, Z[2] = z2), which does not match the
    // formula as written above - confirm which vertex the backend treats as the base
    OSALIGNSIMD(float) a[4];
    _mm_store_ps(a, vZ);
    triDesc.Z[0] = a[0] - a[2];
    triDesc.Z[1] = a[1] - a[2];
    triDesc.Z[2] = a[2];

    // add depth bias
    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);

    // Calc bounding box of triangle
    OSALIGNSIMD(SWR_RECT) bbox;
    calcBoundingBoxInt(vXi, vYi, bbox);

    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
        bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
                   "Conservative rast degenerate handling requires a valid scissor rect");
    }

    // Intersect with scissor/viewport
    OSALIGNSIMD(SWR_RECT) intersect;
    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);

    triDesc.triFlags = workDesc.triFlags;

    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
    uint32_t macroX, macroY;
    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;

    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);

    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);

    AR_END(BETriangleSetup, 0);

    // update triangle desc
    // convert the fixed point intersect rect to an inclusive range of raster tile indices
    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t numTilesX = maxTileX - minTileX + 1;
    uint32_t numTilesY = maxTileY - minTileY + 1;

    // nothing to rasterize: close the outer bucket and bail out
    if (numTilesX == 0 || numTilesY == 0)
    {
        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
        AR_END(BERasterizeTriangle, 1);
        return;
    }

    AR_BEGIN(BEStepSetup, pDC->drawId);

    // Step to pixel center of top-left pixel of the triangle bbox
    // Align intersect bbox (top/left) to raster tile's (top/left).
    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));

    // convenience typedef
    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;

    // single sample rasterization evaluates edges at pixel center,
    // multisample evaluates edges UL pixel corner and steps to each sample position
    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
    {
        // Add 0.5, in fixed point, to offset to pixel center
        x += (FIXED_POINT_SCALE / 2);
        y += (FIXED_POINT_SCALE / 2);
    }

    __m128i vTopLeftX = _mm_set1_epi32(x);
    __m128i vTopLeftY = _mm_set1_epi32(y);

    // evaluate edge equations at top-left pixel using 64bit math
    //
    // line = Ax + By + C
    // solving for C:
    // C = -Ax - By
    // we know x0 and y0 are on the line; plug them in:
    // C = -Ax0 - By0
    // plug C back into line equation:
    // line = Ax + By - Ax0 - By0
    // line = A(x - x0) + B(y - y0)
    // dX = (x-x0), dY = (y-y0)
    // so all this simplifies to
    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within

    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);

    // evaluate A(dx) and B(dY) for all points
    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);

    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);

    // apply any edge adjustments(top-left, crast, etc)
    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);

    // broadcast respective edge results to all lanes
    // NOTE(review): only entries 0..2 are written here; the remaining entries of
    // the 7-element array are presumably filled by ComputeScissorEdges - confirm
    double* pEdge = (double*)&vEdge;
    __m256d vEdgeFix16[7];
    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);

    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
    _mm_store_si128((__m128i*)aAi, vAi);
    _mm_store_si128((__m128i*)aBi, vBi);
    EDGE rastEdges[RT::NumEdgesT::value];

    // Compute and store triangle edge data
    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);

    // Compute and store edge data for the scissor if it needs to be rasterized
    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
        (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);

    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
    // used for testing if entire raster tile is inside a triangle
    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
    {
        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
    }

    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
    // step sample positions to the raster tile bbox of multisample points
    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
    //                             |      |
    //                             |      |
    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
    // NOTE(review): vEdgeTileBbox is only initialized when more than one coverage
    // sample is used, yet it is always passed to UpdateEdgeMasks below; presumably
    // the single sample variant ignores it - confirm
    __m256d vEdgeTileBbox[3];
    if (NumCoverageSamplesT::value > 1)
    {
        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();

        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);

        // step edge equation tests from Tile
        // used for testing if entire raster tile is inside a triangle
        for (uint32_t e = 0; e < 3; ++e)
        {
            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);

            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
        }
    }

    AR_END(BEStepSetup, 0);

    uint32_t tY = minTileY;
    uint32_t tX = minTileX;
    uint32_t maxY = maxTileY;
    uint32_t maxX = maxTileX;

    // renderBuffers tracks the current tile; currentRenderBufferRow tracks the first tile of the current row
    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
    currentRenderBufferRow = renderBuffers;

    // rasterize and generate coverage masks per sample
    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
    {
        // remember the edge values at the start of this row so the Y step can restart from them
        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vStartOfRowEdge[e] = vEdgeFix16[e];
        }

        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
        {
            triDesc.anyCoveredSamples = 0;

            // is the corner of the edge outside of the raster tile? (vEdge < 0)
            int mask0, mask1, mask2;
            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);

            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
            {
                // trivial reject, at least one edge has all 4 corners of raster tile outside
                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);

                if (!trivialReject)
                {
                    // trivial accept mask
                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;

                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);

                    // @todo Make this a bit smarter to allow use of trivial accept when:
                    //   1) scissor/vp intersection rect is raster tile aligned
                    //   2) raster tile is entirely within scissor/vp intersection rect
                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
                    {
                        // trivial accept, all 4 corners of all 3 edges are negative
                        // i.e. raster tile completely inside triangle
                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
                        {
                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
                        }
                        RDTSC_EVENT(BETrivialAccept, 1, 0);
                    }
                    else
                    {
                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
                        {
                            // should get optimized out for single sample case (global value numbering or copy propagation)
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                vEdgeAtSample[e] = vEdgeFix16[e];
                            }
                        }
                        else
                        {
                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);

                            // step edge equation tests from UL tile corner to pixel sample position
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
                            }
                        }

                        // extract lane 0 of each edge: the mask enables only the lowest
                        // 64-bit element, so exactly one double is written per edge
                        double startQuadEdges[RT::NumEdgesT::value];
                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                        {
                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
                        }

                        // not trivial accept or reject, must rasterize full tile
                        AR_BEGIN(BERasterizePartial, pDC->drawId);
                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
                        AR_END(BERasterizePartial, 0);

                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];

                        // Output SV InnerCoverage, if needed
                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                    }
                }
                else
                {
                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
                    if(NumCoverageSamplesT::value > 1)
                    {
                        triDesc.coverageMask[sampleNum] = 0;
                    }
                    RDTSC_EVENT(BETrivialReject, 1, 0);
                }
            }

#if KNOB_ENABLE_TOSS_POINTS
            // toss point: record the coverage result instead of invoking the backend
            if(KNOB_TOSS_RS)
            {
                gToss = triDesc.coverageMask[0];
            }
            else
#endif
            if(triDesc.anyCoveredSamples)
            {
                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
                // copy conservative coverage result to all samples
                if(RT::IsConservativeT::value)
                {
                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                }

                AR_BEGIN(BEPixelBackend, pDC->drawId);
                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
                AR_END(BEPixelBackend, 0);
            }

            // step to the next tile in X
            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
            {
                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
            }
            StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
        }

        // step to the next tile in Y
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
        }
        StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
    }

    AR_END(BERasterizeTriangle, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Get pointers to hot tile memory for color RT, depth, stencil.
/// Fetches the hot tile of every enabled attachment for the macrotile,
/// marks it HOTTILE_DIRTY and stores a pointer to the requested raster tile
/// inside it in renderBuffers.
/// @tparam numSamples - sample count; scales the tile offset and is passed to GetHotTile
/// @param pDC - draw context
/// @param macroID - macrotile id
/// @param tileX, tileY - raster tile indices; made relative to the macrotile below
/// @param renderBuffers - receives the color/depth/stencil pointers
/// @param renderTargetArrayIndex - array slice, forwarded to GetHotTile
template <uint32_t numSamples>
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
{
    const API_STATE& state = GetApiState(pDC);
    SWR_CONTEXT *pContext = pDC->pContext;

    // convert the tile indices to be relative to the macrotile origin
    uint32_t mx, my;
    MacroTileMgr::getTileIndices(macroID, mx, my);
    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;

    // compute tile offset for active hottile buffers
    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
    uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
    offset*=numSamples;

    // visit each enabled color attachment, clearing its bit once handled
    unsigned long rtSlot = 0;
    uint32_t colorHottileEnableMask = state.colorHottileEnable;
    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
    {
        // NOTE(review): unlike the depth and stencil paths below, pColor->pBuffer is not asserted non-null
        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
            numSamples, renderTargetArrayIndex);
        pColor->state = HOTTILE_DIRTY;
        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;

        colorHottileEnableMask &= ~(1 << rtSlot);
    }
    if(state.depthHottileEnable)
    {
        // pitch and offset are recomputed for the depth hot tile format
        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
        offset*=numSamples;
        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
            numSamples, renderTargetArrayIndex);
        pDepth->state = HOTTILE_DIRTY;
        SWR_ASSERT(pDepth->pBuffer != nullptr);
        renderBuffers.pDepth = pDepth->pBuffer + offset;
    }
    if(state.stencilHottileEnable)
    {
        // pitch and offset are recomputed for the stencil hot tile format
        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
        offset*=numSamples;
        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
            numSamples, renderTargetArrayIndex);
        pStencil->state = HOTTILE_DIRTY;
        SWR_ASSERT(pStencil->pBuffer != nullptr);
        renderBuffers.pStencil = pStencil->pBuffer + offset;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Advances the render output pointers by one raster tile in X.
/// @tparam RT - rasterizer traits supplying the per-tile step sizes
/// @param colorHotTileMask - bitmask of color targets to advance (local copy, consumed bit by bit)
/// @param buffers - pointers to advance; depth and stencil are advanced
///        unconditionally, color only for the bits set in the mask
template <typename RT>
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
{
    DWORD rt = 0;
    while (_BitScanForward(&rt, colorHotTileMask))
    {
        colorHotTileMask &= ~(1 << rt);
        buffers.pColor[rt] += RT::colorRasterTileStep;
    }

    buffers.pDepth += RT::depthRasterTileStep;
    buffers.pStencil += RT::stencilRasterTileStep;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Advances the render output pointers to the first raster tile of
/// the next tile row.
/// @tparam RT - rasterizer traits supplying the per-row step sizes
/// @param colorHotTileMask - bitmask of color targets to advance (local copy, consumed bit by bit)
/// @param buffers - current tile pointers; reset to the start of the new row
/// @param startBufferRow - pointers to the start of the current row; advanced
///        by one row (depth and stencil unconditionally, color per mask bit)
template <typename RT>
INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
{
    DWORD rt = 0;
    while (_BitScanForward(&rt, colorHotTileMask))
    {
        colorHotTileMask &= ~(1 << rt);
        startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
        buffers.pColor[rt] = startBufferRow.pColor[rt];
    }
    startBufferRow.pDepth += RT::depthRasterTileRowStep;
    buffers.pDepth = startBufferRow.pDepth;

    startBufferRow.pStencil += RT::stencilRasterTileRowStep;
    buffers.pStencil = startBufferRow.pStencil;
}