1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file rasterizer.cpp 24 * 25 * @brief Implementation for the rasterizer. 
26 * 27 ******************************************************************************/ 28 29 #include <vector> 30 #include <algorithm> 31 32 #include "rasterizer.h" 33 #include "rdtsc_core.h" 34 #include "backend.h" 35 #include "utils.h" 36 #include "frontend.h" 37 #include "tilemgr.h" 38 #include "memory/tilingtraits.h" 39 40 template <uint32_t numSamples = 1> 41 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex); 42 template <typename RT> 43 void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers); 44 template <typename RT> 45 void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); 46 47 #define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} 48 const __m256d gMaskToVecpd[] = 49 { 50 MASKTOVEC(0, 0, 0, 0), 51 MASKTOVEC(0, 0, 0, 1), 52 MASKTOVEC(0, 0, 1, 0), 53 MASKTOVEC(0, 0, 1, 1), 54 MASKTOVEC(0, 1, 0, 0), 55 MASKTOVEC(0, 1, 0, 1), 56 MASKTOVEC(0, 1, 1, 0), 57 MASKTOVEC(0, 1, 1, 1), 58 MASKTOVEC(1, 0, 0, 0), 59 MASKTOVEC(1, 0, 0, 1), 60 MASKTOVEC(1, 0, 1, 0), 61 MASKTOVEC(1, 0, 1, 1), 62 MASKTOVEC(1, 1, 0, 0), 63 MASKTOVEC(1, 1, 0, 1), 64 MASKTOVEC(1, 1, 1, 0), 65 MASKTOVEC(1, 1, 1, 1), 66 }; 67 68 struct POS 69 { 70 int32_t x, y; 71 }; 72 73 struct EDGE 74 { 75 double a, b; // a, b edge coefficients in fix8 76 double stepQuadX; // step to adjacent horizontal quad in fix16 77 double stepQuadY; // step to adjacent vertical quad in fix16 78 double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 79 double stepRasterTileY; // step to adjacent vertical raster tile in fix16 80 81 __m256d vQuadOffsets; // offsets for 4 samples of a quad 82 __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile 83 }; 84 85 ////////////////////////////////////////////////////////////////////////// 86 /// @brief rasterize a raster tile partially covered by the triangle 87 /// @param vEdge0-2 - 
edge equations evaluated at sample pos at each of the 4 corners of a raster tile 88 /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) 89 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. 90 /// Used to step between quads when sweeping over the raster tile. 91 template<uint32_t NumEdges, typename EdgeMaskT> 92 INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges) 93 { 94 uint64_t coverageMask = 0; 95 96 __m256d vEdges[NumEdges]; 97 __m256d vStepX[NumEdges]; 98 __m256d vStepY[NumEdges]; 99 100 for (uint32_t e = 0; e < NumEdges; ++e) 101 { 102 // Step to the pixel sample locations of the 1st quad 103 vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets); 104 105 // compute step to next quad (mul by 2 in x and y direction) 106 vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX); 107 vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY); 108 } 109 110 // fast unrolled version for 8x8 tile 111 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 112 int edgeMask[NumEdges]; 113 uint64_t mask; 114 115 auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);}; 116 auto update_lambda = [&](int e){mask &= edgeMask[e];}; 117 auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);}; 118 auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);}; 119 auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);}; 120 121 // evaluate which pixels in the quad are covered 122 #define EVAL \ 123 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda); 124 125 // update coverage mask 126 // if edge 0 is degenerate and will be skipped; init the mask 127 #define UPDATE_MASK(bit) \ 128 if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\ 129 mask = 0xf;\ 130 }\ 131 else{\ 132 mask = edgeMask[0]; \ 133 }\ 134 
UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \ 135 coverageMask |= (mask << bit); 136 137 // step in the +x direction to the next quad 138 #define INCX \ 139 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda); 140 141 // step in the +y direction to the next quad 142 #define INCY \ 143 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda); 144 145 // step in the -x direction to the next quad 146 #define DECX \ 147 UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda); 148 149 // sweep 2x2 quad back and forth through the raster tile, 150 // computing coverage masks for the entire tile 151 152 // raster tile 153 // 0 1 2 3 4 5 6 7 154 // x x 155 // x x ------------------> 156 // x x | 157 // <-----------------x x V 158 // .. 159 160 // row 0 161 EVAL; 162 UPDATE_MASK(0); 163 INCX; 164 EVAL; 165 UPDATE_MASK(4); 166 INCX; 167 EVAL; 168 UPDATE_MASK(8); 169 INCX; 170 EVAL; 171 UPDATE_MASK(12); 172 INCY; 173 174 //row 1 175 EVAL; 176 UPDATE_MASK(28); 177 DECX; 178 EVAL; 179 UPDATE_MASK(24); 180 DECX; 181 EVAL; 182 UPDATE_MASK(20); 183 DECX; 184 EVAL; 185 UPDATE_MASK(16); 186 INCY; 187 188 // row 2 189 EVAL; 190 UPDATE_MASK(32); 191 INCX; 192 EVAL; 193 UPDATE_MASK(36); 194 INCX; 195 EVAL; 196 UPDATE_MASK(40); 197 INCX; 198 EVAL; 199 UPDATE_MASK(44); 200 INCY; 201 202 // row 3 203 EVAL; 204 UPDATE_MASK(60); 205 DECX; 206 EVAL; 207 UPDATE_MASK(56); 208 DECX; 209 EVAL; 210 UPDATE_MASK(52); 211 DECX; 212 EVAL; 213 UPDATE_MASK(48); 214 #else 215 uint32_t bit = 0; 216 for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) 217 { 218 __m256d vStartOfRowEdge[NumEdges]; 219 for (uint32_t e = 0; e < NumEdges; ++e) 220 { 221 vStartOfRowEdge[e] = vEdges[e]; 222 } 223 224 for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) 225 { 226 int edgeMask[NumEdges]; 227 for (uint32_t e = 0; e < NumEdges; ++e) 228 { 229 edgeMask[e] = _mm256_movemask_pd(vEdges[e]); 230 } 231 232 uint64_t mask = edgeMask[0]; 233 for (uint32_t e = 1; e < 
NumEdges; ++e) 234 { 235 mask &= edgeMask[e]; 236 } 237 coverageMask |= (mask << bit); 238 239 // step to the next pixel in the x 240 for (uint32_t e = 0; e < NumEdges; ++e) 241 { 242 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); 243 } 244 bit+=4; 245 } 246 247 // step to the next row 248 for (uint32_t e = 0; e < NumEdges; ++e) 249 { 250 vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]); 251 } 252 } 253 #endif 254 return coverageMask; 255 256 } 257 // Top left rule: 258 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge 259 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge 260 // Top left: a sample is in if it is a top or left edge. 261 // Out: !(horizontal && above) = !horizontal && below 262 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right 263 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) 264 { 265 // if vA < 0, vC-- 266 // if vA == 0 && vB < 0, vC-- 267 268 __m256d vEdgeOut = vEdge; 269 __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); 270 271 // if vA < 0 (line is not horizontal and below) 272 int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); 273 274 // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) 275 __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); 276 int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); 277 msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); 278 279 // if either of these are true and we're on the line (edge == 0), bump it outside the line 280 vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); 281 } 282 283 ////////////////////////////////////////////////////////////////////////// 284 /// @brief calculates difference in precision between the result of manh 285 /// calculation and the edge precision, based on compile time trait values 286 template<typename RT> 287 constexpr 
int64_t ManhToEdgePrecisionAdjust() 288 { 289 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, 290 "Inadequate precision of result of manh calculation "); 291 return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value); 292 } 293 294 ////////////////////////////////////////////////////////////////////////// 295 /// @struct adjustEdgeConservative 296 /// @brief Primary template definition used for partially specializing 297 /// the adjustEdgeConservative function. This struct should never 298 /// be instantiated. 299 /// @tparam RT: rasterizer traits 300 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting? 301 template <typename RT, typename ConservativeEdgeOffsetT> 302 struct adjustEdgeConservative 303 { 304 ////////////////////////////////////////////////////////////////////////// 305 /// @brief Performs calculations to adjust each edge of a triangle away 306 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y 307 /// direction. 308 /// 309 /// Uncertainty regions arise from fixed point rounding, which 310 /// can snap a vertex +/- by min fixed point value. 311 /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners. 312 /// This allows the rasterizer to test for coverage only at the pixel center, 313 /// instead of having to test individual pixel corners for conservative coverage 314 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) 315 { 316 // Assumes CCW winding order. 
Subtracting from the evaluated edge equation moves the edge away 317 // from the pixel center (in the direction of the edge normal A/B) 318 319 // edge = Ax + Bx + C - (manh/e) 320 // manh = manhattan distance = abs(A) + abs(B) 321 // e = absolute rounding error from snapping from float to fixed point precision 322 323 // 'fixed point' multiply (in double to be avx1 friendly) 324 // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example 325 __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); 326 __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), 327 _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); 328 329 static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, 330 "Inadequate precision of result of manh calculation "); 331 332 // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision 333 // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right 334 manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5)); 335 336 // move the edge away from the pixel center by the required conservative precision + 1/2 pixel 337 // this allows the rasterizer to do a single conservative coverage test to see if the primitive 338 // intersects the pixel at all 339 vEdge = _mm256_sub_pd(vEdge, manh); 340 }; 341 }; 342 343 ////////////////////////////////////////////////////////////////////////// 344 /// @brief adjustEdgeConservative specialization where no edge offset is needed 345 template <typename RT> 346 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>> 347 { 348 INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {}; 349 }; 350 351 
////////////////////////////////////////////////////////////////////////// 352 /// @brief calculates the distance a degenerate BBox needs to be adjusted 353 /// for conservative rast based on compile time trait values 354 template<typename RT> 355 constexpr int64_t ConservativeScissorOffset() 356 { 357 static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision"); 358 // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges 359 typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT; 360 // 1/2 pixel edge offset + conservative offset - degenerateTriangle 361 return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value)); 362 } 363 364 ////////////////////////////////////////////////////////////////////////// 365 /// @brief Performs calculations to adjust each a vector of evaluated edges out 366 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y 367 /// direction. 368 template <typename RT> 369 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge) 370 { 371 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); 372 int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>(); 373 vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh)); 374 }; 375 376 ////////////////////////////////////////////////////////////////////////// 377 /// @brief Performs calculations to adjust each a scalar evaluated edge out 378 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y 379 /// direction. 
380 template <typename RT, typename OffsetT> 381 INLINE double adjustScalarEdge(const double a, const double b, const double Edge) 382 { 383 int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); 384 int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>(); 385 return (Edge - manh); 386 }; 387 388 ////////////////////////////////////////////////////////////////////////// 389 /// @brief Perform any needed adjustments to evaluated triangle edges 390 template <typename RT, typename EdgeOffsetT> 391 struct adjustEdgesFix16 392 { 393 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) 394 { 395 static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, 396 "Edge equation expected to be in x.16 fixed point"); 397 398 static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled"); 399 400 // need to apply any edge offsets before applying the top-left rule 401 adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge); 402 403 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); 404 } 405 }; 406 407 ////////////////////////////////////////////////////////////////////////// 408 /// @brief Perform top left adjustments to evaluated triangle edges 409 template <typename RT> 410 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>> 411 { 412 INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) 413 { 414 adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); 415 } 416 }; 417 418 // max(abs(dz/dx), abs(dz,dy) 419 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) 420 { 421 /* 422 // evaluate i,j at (0,0) 423 float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; 424 float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; 425 426 // evaluate i,j at (1,0) 427 float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; 428 float j10 = 
pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; 429 430 // compute dz/dx 431 float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; 432 float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; 433 float dzdx = abs(d10 - d00); 434 435 // evaluate i,j at (0,1) 436 float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; 437 float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; 438 439 float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; 440 float dzdy = abs(d01 - d00); 441 */ 442 443 // optimized version of above 444 float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); 445 float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); 446 447 return std::max(dzdx, dzdy); 448 } 449 450 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) 451 { 452 if (pState->depthFormat == R24_UNORM_X8_TYPELESS) 453 { 454 return (1.0f / (1 << 24)); 455 } 456 else if (pState->depthFormat == R16_UNORM) 457 { 458 return (1.0f / (1 << 16)); 459 } 460 else 461 { 462 SWR_ASSERT(pState->depthFormat == R32_FLOAT); 463 464 // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) 465 float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); 466 uint32_t zMaxInt = *(uint32_t*)&zMax; 467 zMaxInt &= 0x7f800000; 468 zMax = *(float*)&zMaxInt; 469 470 return zMax * (1.0f / (1 << 23)); 471 } 472 } 473 474 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) 475 { 476 if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) 477 { 478 return 0.0f; 479 } 480 481 float scale = pState->slopeScaledDepthBias; 482 if (scale != 0.0f) 483 { 484 scale *= ComputeMaxDepthSlope(pTri); 485 } 486 487 float bias = pState->depthBias; 488 if (!pState->depthBiasPreAdjusted) 489 { 490 bias *= ComputeBiasFactor(pState, pTri, z); 491 } 492 bias += scale; 493 494 if 
(pState->depthBiasClamp > 0.0f) 495 { 496 bias = std::min(bias, pState->depthBiasClamp); 497 } 498 else if (pState->depthBiasClamp < 0.0f) 499 { 500 bias = std::max(bias, pState->depthBiasClamp); 501 } 502 503 return bias; 504 } 505 506 // Prevent DCE by writing coverage mask from rasterizer to volatile 507 #if KNOB_ENABLE_TOSS_POINTS 508 __declspec(thread) volatile uint64_t gToss; 509 #endif 510 511 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; 512 // try to avoid _chkstk insertions; make this thread local 513 static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; 514 515 INLINE 516 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) 517 { 518 edge.a = a; 519 edge.b = b; 520 521 // compute constant steps to adjacent quads 522 edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); 523 edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); 524 525 // compute constant steps to adjacent raster tiles 526 edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); 527 edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); 528 529 // compute quad offsets 530 const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); 531 const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); 532 533 __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); 534 __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); 535 edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); 536 537 // compute raster tile offsets 538 const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0); 539 const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 
(KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0); 540 541 __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); 542 __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); 543 edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); 544 } 545 546 INLINE 547 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) 548 { 549 ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); 550 } 551 552 ////////////////////////////////////////////////////////////////////////// 553 /// @brief Primary template definition used for partially specializing 554 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel 555 /// corner to sample position, and test for coverage 556 /// @tparam sampleCount: multisample count 557 template <typename NumSamplesT> 558 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, 559 int32_t &mask0, int32_t &mask1, int32_t &mask2) 560 { 561 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; 562 // evaluate edge equations at the tile multisample bounding box 563 vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); 564 vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); 565 vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); 566 mask0 = _mm256_movemask_pd(vSampleBboxTest0); 567 mask1 = _mm256_movemask_pd(vSampleBboxTest1); 568 mask2 = _mm256_movemask_pd(vSampleBboxTest2); 569 } 570 571 ////////////////////////////////////////////////////////////////////////// 572 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated 573 /// when only rasterizing a single coverage test point 574 template <> 575 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16, 576 int32_t &mask0, int32_t &mask1, int32_t &mask2) 577 { 578 mask0 = _mm256_movemask_pd(vEdgeFix16[0]); 579 mask1 = _mm256_movemask_pd(vEdgeFix16[1]); 580 mask2 = 
_mm256_movemask_pd(vEdgeFix16[2]); 581 } 582 583 ////////////////////////////////////////////////////////////////////////// 584 /// @struct ComputeScissorEdges 585 /// @brief Primary template definition. Allows the function to be generically 586 /// called. When paired with below specializations, will result in an empty 587 /// inlined function if scissor is not enabled 588 /// @tparam RasterScissorEdgesT: is scissor enabled? 589 /// @tparam IsConservativeT: is conservative rast enabled? 590 /// @tparam RT: rasterizer traits 591 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT> 592 struct ComputeScissorEdges 593 { 594 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 595 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){}; 596 }; 597 598 ////////////////////////////////////////////////////////////////////////// 599 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial 600 /// specialization. Instantiated when conservative rast and scissor are enabled 601 template <typename RT> 602 struct ComputeScissorEdges<std::true_type, std::true_type, RT> 603 { 604 ////////////////////////////////////////////////////////////////////////// 605 /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, 606 /// evaluate edge equations and offset them away from pixel center. 
607 INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 608 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) 609 { 610 // if conservative rasterizing, triangle bbox intersected with scissor bbox is used 611 SWR_RECT scissor; 612 scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin); 613 scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax); 614 scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin); 615 scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax); 616 617 POS topLeft{scissor.xmin, scissor.ymin}; 618 POS bottomLeft{scissor.xmin, scissor.ymax}; 619 POS topRight{scissor.xmax, scissor.ymin}; 620 POS bottomRight{scissor.xmax, scissor.ymax}; 621 622 // construct 4 scissor edges in ccw direction 623 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); 624 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); 625 ComputeEdgeData(bottomRight, topRight, rastEdges[5]); 626 ComputeEdgeData(topRight, topLeft, rastEdges[6]); 627 628 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); 629 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); 630 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); 631 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); 632 633 // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing 634 adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]); 635 adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]); 636 adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]); 637 adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]); 638 639 // Upper left rule for scissor 640 vEdgeFix16[3] = 
_mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); 641 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0)); 642 } 643 }; 644 645 ////////////////////////////////////////////////////////////////////////// 646 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial 647 /// specialization. Instantiated when scissor is enabled and conservative rast 648 /// is disabled. 649 template <typename RT> 650 struct ComputeScissorEdges<std::true_type, std::false_type, RT> 651 { 652 ////////////////////////////////////////////////////////////////////////// 653 /// @brief Compute scissor edge vectors and evaluate edge equations 654 INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 655 EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]) 656 { 657 const SWR_RECT &scissor = scissorBBox; 658 POS topLeft{scissor.xmin, scissor.ymin}; 659 POS bottomLeft{scissor.xmin, scissor.ymax}; 660 POS topRight{scissor.xmax, scissor.ymin}; 661 POS bottomRight{scissor.xmax, scissor.ymax}; 662 663 // construct 4 scissor edges in ccw direction 664 ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); 665 ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); 666 ComputeEdgeData(bottomRight, topRight, rastEdges[5]); 667 ComputeEdgeData(topRight, topLeft, rastEdges[6]); 668 669 vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin))); 670 vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax))); 671 vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax))); 672 vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin))); 673 674 // Upper left rule for scissor 675 vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0)); 676 vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], 
_mm256_set1_pd(1.0)); 677 } 678 }; 679 680 ////////////////////////////////////////////////////////////////////////// 681 /// @brief Primary function template for TrivialRejectTest. Should 682 /// never be called, but TemplateUnroller instantiates a few unused values, 683 /// so it calls a runtime assert instead of a static_assert. 684 template <typename ValidEdgeMaskT> 685 INLINE bool TrivialRejectTest(const int, const int, const int) 686 { 687 SWR_ASSERT(0, "Primary templated function should never be called"); 688 return false; 689 }; 690 691 ////////////////////////////////////////////////////////////////////////// 692 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0 693 /// and edge 1 for trivial coverage reject 694 template <> 695 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int) 696 { 697 return (!(mask0 && mask1)) ? true : false; 698 }; 699 700 ////////////////////////////////////////////////////////////////////////// 701 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0 702 /// and edge 2 for trivial coverage reject 703 template <> 704 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2) 705 { 706 return (!(mask0 && mask2)) ? true : false; 707 }; 708 709 ////////////////////////////////////////////////////////////////////////// 710 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1 711 /// and edge 2 for trivial coverage reject 712 template <> 713 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2) 714 { 715 return (!(mask1 && mask2)) ? true : false; 716 }; 717 718 ////////////////////////////////////////////////////////////////////////// 719 /// @brief AllEdgesValidT specialization of TrivialRejectTest. 
Tests all 720 /// primitive edges for trivial coverage reject 721 template <> 722 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2) 723 { 724 return (!(mask0 && mask1 && mask2)) ? true : false;; 725 }; 726 727 ////////////////////////////////////////////////////////////////////////// 728 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate 729 /// point, so return false and rasterize against conservative BBox 730 template <> 731 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int) 732 { 733 return false; 734 }; 735 736 ////////////////////////////////////////////////////////////////////////// 737 /// @brief Primary function template for TrivialAcceptTest. Always returns 738 /// false, since it will only be called for degenerate tris, and as such 739 /// will never cover the entire raster tile 740 template <typename ScissorEnableT> 741 INLINE bool TrivialAcceptTest(const int, const int, const int) 742 { 743 return false; 744 }; 745 746 ////////////////////////////////////////////////////////////////////////// 747 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all 748 /// edge masks for a fully covered raster tile 749 template <> 750 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2) 751 { 752 return ((mask0 & mask1 & mask2) == 0xf); 753 }; 754 755 ////////////////////////////////////////////////////////////////////////// 756 /// @brief Primary function template for GenerateSVInnerCoverage. 
Results 757 /// in an empty function call if SVInnerCoverage isn't requested 758 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> 759 struct GenerateSVInnerCoverage 760 { 761 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t &){}; 762 }; 763 764 ////////////////////////////////////////////////////////////////////////// 765 /// @brief Specialization of GenerateSVInnerCoverage where all edges 766 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 767 /// edge values from OuterConservative to InnerConservative and rasterizes. 768 template <typename RT> 769 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT> 770 { 771 INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask) 772 { 773 SWR_CONTEXT *pContext = pDC->pContext; 774 775 double startQuadEdgesAdj[RT::NumEdgesT::value]; 776 for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e) 777 { 778 startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); 779 } 780 781 // not trivial accept or reject, must rasterize full tile 782 AR_BEGIN(BERasterizePartial, pDC->drawId); 783 innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges); 784 AR_END(BERasterizePartial, 0); 785 } 786 }; 787 788 ////////////////////////////////////////////////////////////////////////// 789 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. 
Results 790 /// in an empty function call if SVInnerCoverage isn't requested 791 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> 792 struct UpdateEdgeMasksInnerConservative 793 { 794 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*, 795 const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){}; 796 }; 797 798 ////////////////////////////////////////////////////////////////////////// 799 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges 800 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 801 /// evaluated at raster tile corners to inner conservative position and 802 /// updates edge masks 803 template <typename RT> 804 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT> 805 { 806 INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, 807 const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2) 808 { 809 __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; 810 811 // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 812 // conservative evaluated edge when adjusting the edge in for inner conservative tests 813 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]); 814 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]); 815 adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]); 816 817 UpdateEdgeMasks<typename RT::NumRasterSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); 818 } 819 }; 820 821 ////////////////////////////////////////////////////////////////////////// 822 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 823 /// is requested but at least one edge is degenerate. 
/// Since a degenerate triangle cannot cover an entire raster tile, set mask0 to 0
/// to force it down the rasterizePartialTile path
template <typename RT, typename ValidEdgeMaskT>
struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
{
    /// Only mask0 is written; every other argument is ignored.
    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
                                           const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
    {
        // set one mask to zero to force the triangle down the rasterizePartialTile path
        mask0 = 0;
    }
};

//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one triangle work item against one macrotile.
///        Sets up the edge equations and the SWR_TRIANGLE_DESC (barycentric
///        coefficients, 1/w, perspective-premultiplied attributes, Z with depth
///        bias), clips the triangle bounding box against the scissor rect and
///        the macrotile, then walks the covered raster tiles. Each tile is
///        trivially rejected, trivially accepted or partially rasterized per
///        sample, and tiles with any covered sample are handed to
///        backendFuncs.pfnBackend.
/// @param pDC - draw context; supplies API state, backend functions and drawId
/// @param workerId - forwarded to the pixel backend and to GenerateSVInnerCoverage
/// @param macroTile - macrotile id, decoded with MacroTileMgr::getTileIndices
/// @param pDesc - pointer to the TRIANGLE_WORK_DESC describing the triangle
template <typename RT>
void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
{
    // not referenced directly below; presumably consumed by the AR_*/RDTSC_* macros - confirm
    SWR_CONTEXT *pContext = pDC->pContext;
    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_BIN_TRIS)
    {
        return;
    }
#endif
    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
    AR_BEGIN(BETriangleSetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);
    const SWR_RASTSTATE &rastState = state.rastState;
    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;

    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;

    __m128 vX, vY, vZ, vRecipW;

    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
    // eg: vX = [x0 x1 x2 dc]
    vX = _mm_load_ps(workDesc.pTriBuffer);
    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);

    // convert to fixed point
    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
    __m128i vXi = fpToFixedPoint(vX);
    __m128i vYi = fpToFixedPoint(vY);

    // quantize floating point position to fixed point precision
    // to prevent attribute creep around the triangle vertices
    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));

    // triangle setup - A and B edge equation coefs
    __m128 vA, vB;
    triangleSetupAB(vX, vY, vA, vB);

    __m128i vAi, vBi;
    triangleSetupABInt(vXi, vYi, vAi, vBi);

    // determinant
    float det = calcDeterminantInt(vAi, vBi);

    // Verts in Pixel Coordinate Space at this point
    // Det > 0 = CW winding order
    // Convert CW triangles to CCW
    if (det > 0.0)
    {
        // negate both the float and the integer coefficients so they stay in agreement
        vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
        vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
        det = -det;
    }

    __m128 vC;
    // Finish triangle setup - C edge coef
    triangleSetupC(vX, vY, vA, vB, vC);

    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we have degenerate edge(s) to rasterize, set I and J coefs
        // to 0 for constant interpolation of attributes
        triDesc.I[0] = 0.0f;
        triDesc.I[1] = 0.0f;
        triDesc.I[2] = 0.0f;
        triDesc.J[0] = 0.0f;
        triDesc.J[1] = 0.0f;
        triDesc.J[2] = 0.0f;

        // Degenerate triangles have no area
        triDesc.recipDet = 0.0f;
    }
    else
    {
        // only extract coefs for 2 of the barycentrics; the 3rd can be
        // determined from the barycentric equation:
        // i + j + k = 1 <=> k = 1 - j - i
        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);

        // compute recipDet, used to calculate barycentric i and j in the backend
        triDesc.recipDet = 1.0f/det;
    }

    // store 1/w relative to vertex 2: entries 0 and 1 are deltas, entry 2 is the base value
    OSALIGNSIMD(float) oneOverW[4];
    _mm_store_ps(oneOverW, vRecipW);
    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
    triDesc.OneOverW[2] = oneOverW[2];

    // calculate perspective correct coefs per vertex attrib
    float* pPerspAttribs = perspAttribsTLS;
    float* pAttribs = workDesc.pAttribs;
    triDesc.pPerspAttribs = pPerspAttribs;
    triDesc.pAttribs = pAttribs;
    float *pRecipW = workDesc.pTriBuffer + 12;
    triDesc.pRecipW = pRecipW;
    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
    // each attribute occupies 3 consecutive 4-float vectors, one per vertex;
    // every vector is scaled by that vertex's 1/w
    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
    {
        __m128 attribA = _mm_load_ps(pAttribs);
        __m128 attribB = _mm_load_ps(pAttribs+=4);
        __m128 attribC = _mm_load_ps(pAttribs+=4);
        pAttribs+=4;

        attribA = _mm_mul_ps(attribA, vOneOverWV0);
        attribB = _mm_mul_ps(attribB, vOneOverWV1);
        attribC = _mm_mul_ps(attribC, vOneOverWV2);

        _mm_store_ps(pPerspAttribs, attribA);
        _mm_store_ps(pPerspAttribs+=4, attribB);
        _mm_store_ps(pPerspAttribs+=4, attribC);
        pPerspAttribs+=4;
    }

    // compute bary Z
    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
    OSALIGNSIMD(float) a[4];
    _mm_store_ps(a, vZ);
    triDesc.Z[0] = a[0] - a[2];
    triDesc.Z[1] = a[1] - a[2];
    triDesc.Z[2] = a[2];

    // add depth bias
    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);

    // Calc bounding box of triangle
    OSALIGNSIMD(SWR_RECT) bbox;
    calcBoundingBoxInt(vXi, vYi, bbox);

    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
        bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
                   "Conservative rast degenerate handling requires a valid scissor rect");
    }

    // Intersect with scissor/viewport
    OSALIGNSIMD(SWR_RECT) intersect;
    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);

    triDesc.triFlags = workDesc.triFlags;

    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
    uint32_t macroX, macroY;
    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;

    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);

    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);

    AR_END(BETriangleSetup, 0);

    // update triangle desc
    // shifting by tile shift + fixed point shift turns fixed point coordinates into raster tile indices
    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t numTilesX = maxTileX - minTileX + 1;
    uint32_t numTilesY = maxTileY - minTileY + 1;

    // NOTE(review): the counts are unsigned, so this only fires when max == min - 1;
    // a larger inversion wraps to a big count and is instead skipped by the
    // "tile <= max" loop conditions further down
    if (numTilesX == 0 || numTilesY == 0)
    {
        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
        AR_END(BERasterizeTriangle, 1);
        return;
    }

    AR_BEGIN(BEStepSetup, pDC->drawId);

    // Step to pixel center of top-left pixel of the triangle bbox
    // Align intersect bbox (top/left) to raster tile's (top/left).
    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));

    // convenience typedef
    typedef typename RT::NumRasterSamplesT NumRasterSamplesT;

    // single sample rasterization evaluates edges at pixel center,
    // multisample evaluates edges UL pixel corner and steps to each sample position
    if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
    {
        // Add 0.5, in fixed point, to offset to pixel center
        x += (FIXED_POINT_SCALE / 2);
        y += (FIXED_POINT_SCALE / 2);
    }

    __m128i vTopLeftX = _mm_set1_epi32(x);
    __m128i vTopLeftY = _mm_set1_epi32(y);

    // evaluate edge equations at top-left pixel using 64bit math
    //
    // line = Ax + By + C
    // solving for C:
    // C = -Ax - By
    // we know x0 and y0 are on the line; plug them in:
    // C = -Ax0 - By0
    // plug C back into line equation:
    // line = Ax + By - Ax0 - By0
    // line = A(x - x0) + B(y - y0)
    // dX = (x-x0), dY = (y-y0)
    // so all this simplifies to
    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within

    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);

    // evaluate A(dx) and B(dY) for all points
    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);

    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);

    // apply any edge adjustments(top-left, crast, etc)
    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);

    // broadcast respective edge results to all lanes
    // only entries 0-2 are filled here; the array is also handed to ComputeScissorEdges below
    double* pEdge = (double*)&vEdge;
    __m256d vEdgeFix16[7];
    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);

    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
    _mm_store_si128((__m128i*)aAi, vAi);
    _mm_store_si128((__m128i*)aBi, vBi);
    EDGE rastEdges[RT::NumEdgesT::value];

    // Compute and store triangle edge data
    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);

    // Compute and store triangle edge data if scissor needs to be rasterized
    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);

    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
    // used for testing if entire raster tile is inside a triangle
    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
    {
        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
    }

    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
    // step sample positions to the raster tile bbox of multisample points
    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
    //                             |      |
    //                             |      |
    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
    // note: vEdgeTileBbox is only written when more than one raster sample is used
    __m256d vEdgeTileBbox[3];
    if (NumRasterSamplesT::value > 1)
    {
        __m128i vTileSampleBBoxXh = RT::MT::TileSampleOffsetsX();
        __m128i vTileSampleBBoxYh = RT::MT::TileSampleOffsetsY();

        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);

        // step edge equation tests from Tile
        // used for testing if entire raster tile is inside a triangle
        for (uint32_t e = 0; e < 3; ++e)
        {
            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);

            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
        }
    }

    AR_END(BEStepSetup, 0);

    uint32_t tY = minTileY;
    uint32_t tX = minTileX;
    uint32_t maxY = maxTileY;
    uint32_t maxX = maxTileX;

    // renderBuffers follows the current tile; currentRenderBufferRow is the row start
    // that StepRasterTileY advances and copies back into renderBuffers
    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
    currentRenderBufferRow = renderBuffers;

    // rasterize and generate coverage masks per sample
    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
    {
        // remember the edge values at the start of the row so the Y step can restart from them
        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vStartOfRowEdge[e] = vEdgeFix16[e];
        }

        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
        {
            triDesc.anyCoveredSamples = 0;

            // is the corner of the edge outside of the raster tile? (vEdge < 0)
            int mask0, mask1, mask2;
            UpdateEdgeMasks<NumRasterSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);

            // NOTE(review): the masks are computed once per tile, but
            // UpdateEdgeMasksInnerConservative may rewrite them inside this loop, so later
            // samples are tested against the adjusted masks - confirm this is intended
            for (uint32_t sampleNum = 0; sampleNum < NumRasterSamplesT::value; sampleNum++)
            {
                // trivial reject, at least one edge has all 4 corners of raster tile outside
                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);

                if (!trivialReject)
                {
                    // trivial accept mask
                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;

                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);

                    // @todo Make this a bit smarter to allow use of trivial accept when:
                    //   1) scissor/vp intersection rect is raster tile aligned
                    //   2) raster tile is entirely within scissor/vp intersection rect
                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
                    {
                        // trivial accept, all 4 corners of all 3 edges are negative
                        // i.e. raster tile completely inside triangle
                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
                        {
                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
                        }
                        RDTSC_EVENT(BETrivialAccept, 1, 0);
                    }
                    else
                    {
                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
                        if(std::is_same<NumRasterSamplesT, SingleSampleT>::value)
                        {
                            // should get optimized out for single sample case (global value numbering or copy propagation)
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                vEdgeAtSample[e] = vEdgeFix16[e];
                            }
                        }
                        else
                        {
                            __m128i vSampleOffsetXh = RT::MT::vXi(sampleNum);
                            __m128i vSampleOffsetYh = RT::MT::vYi(sampleNum);
                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);

                            // step edge equation tests from UL tile corner to pixel sample position
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
                            }
                        }

                        // the mask enables only the lowest 64-bit lane, so exactly one
                        // double (lane 0 of each edge vector) is written per edge
                        double startQuadEdges[RT::NumEdgesT::value];
                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                        {
                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
                        }

                        // not trivial accept or reject, must rasterize full tile
                        AR_BEGIN(BERasterizePartial, pDC->drawId);
                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
                        AR_END(BERasterizePartial, 0);

                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];

                        // Output SV InnerCoverage, if needed
                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                    }
                }
                else
                {
                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
                    if(NumRasterSamplesT::value > 1)
                    {
                        triDesc.coverageMask[sampleNum] = 0;
                    }
                    RDTSC_EVENT(BETrivialReject, 1, 0);
                }
            }

#if KNOB_ENABLE_TOSS_POINTS
            if(KNOB_TOSS_RS)
            {
                gToss = triDesc.coverageMask[0];
            }
            else
#endif
            if(triDesc.anyCoveredSamples)
            {
                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
                // copy conservative coverage result to all samples
                if(RT::IsConservativeT::value)
                {
                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                }

                // the backend receives the tile origin in pixels (tile index << tile dim shift)
                AR_BEGIN(BEPixelBackend, pDC->drawId);
                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
                AR_END(BEPixelBackend, 0);
            }

            // step to the next tile in X
            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
            {
                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
            }
            StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
        }

        // step to the next tile in Y
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
        }
        StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
    }

AR_END(BERasterizeTriangle, 1); 1292 } 1293 1294 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) 1295 { 1296 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; 1297 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; 1298 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; 1299 1300 bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0; 1301 1302 // load point vertex 1303 float x = *workDesc.pTriBuffer; 1304 float y = *(workDesc.pTriBuffer + 1); 1305 float z = *(workDesc.pTriBuffer + 2); 1306 1307 // create a copy of the triangle buffer to write our adjusted vertices to 1308 OSALIGNSIMD(float) newTriBuffer[4 * 4]; 1309 TRIANGLE_WORK_DESC newWorkDesc = workDesc; 1310 newWorkDesc.pTriBuffer = &newTriBuffer[0]; 1311 1312 // create a copy of the attrib buffer to write our adjusted attribs to 1313 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; 1314 newWorkDesc.pAttribs = &newAttribBuffer[0]; 1315 1316 newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer; 1317 newWorkDesc.numAttribs = workDesc.numAttribs; 1318 newWorkDesc.triFlags = workDesc.triFlags; 1319 1320 // construct two tris by bloating point by point size 1321 float halfPointSize = workDesc.triFlags.pointSize * 0.5f; 1322 float lowerX = x - halfPointSize; 1323 float upperX = x + halfPointSize; 1324 float lowerY = y - halfPointSize; 1325 float upperY = y + halfPointSize; 1326 1327 // tri 0 1328 float *pBuf = &newTriBuffer[0]; 1329 *pBuf++ = lowerX; 1330 *pBuf++ = lowerX; 1331 *pBuf++ = upperX; 1332 pBuf++; 1333 *pBuf++ = lowerY; 1334 *pBuf++ = upperY; 1335 *pBuf++ = upperY; 1336 pBuf++; 1337 _mm_store_ps(pBuf, _mm_set1_ps(z)); 1338 _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f)); 1339 1340 // setup triangle rasterizer function 1341 PFN_WORK_FUNC pfnTriRast; 1342 // for center sample pattern, all samples are at pixel center; calculate coverage 1343 // once at center and broadcast 
the results in the backend 1344 uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; 1345 // conservative rast not supported for points/lines 1346 pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false)); 1347 1348 // overwrite texcoords for point sprites 1349 if (isPointSpriteTexCoordEnabled) 1350 { 1351 // copy original attribs 1352 memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float)); 1353 newWorkDesc.pAttribs = &newAttribBuffer[0]; 1354 1355 // overwrite texcoord for point sprites 1356 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; 1357 DWORD texCoordAttrib = 0; 1358 1359 while (_BitScanForward(&texCoordAttrib, texCoordMask)) 1360 { 1361 texCoordMask &= ~(1 << texCoordAttrib); 1362 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; 1363 if (rastState.pointSpriteTopOrigin) 1364 { 1365 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); 1366 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0); 1367 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); 1368 } 1369 else 1370 { 1371 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); 1372 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0); 1373 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); 1374 } 1375 } 1376 } 1377 else 1378 { 1379 // no texcoord overwrite, can reuse the attrib buffer from frontend 1380 newWorkDesc.pAttribs = workDesc.pAttribs; 1381 } 1382 1383 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); 1384 1385 // tri 1 1386 pBuf = &newTriBuffer[0]; 1387 *pBuf++ = lowerX; 1388 *pBuf++ = upperX; 1389 *pBuf++ = upperX; 1390 pBuf++; 1391 *pBuf++ = lowerY; 1392 *pBuf++ = upperY; 1393 *pBuf++ = lowerY; 1394 // z, w unchanged 1395 1396 if (isPointSpriteTexCoordEnabled) 1397 { 1398 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; 1399 DWORD texCoordAttrib = 0; 1400 1401 while (_BitScanForward(&texCoordAttrib, texCoordMask)) 1402 { 
1403 texCoordMask &= ~(1 << texCoordAttrib); 1404 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; 1405 if (rastState.pointSpriteTopOrigin) 1406 { 1407 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); 1408 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1); 1409 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); 1410 1411 } 1412 else 1413 { 1414 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); 1415 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1); 1416 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); 1417 } 1418 } 1419 } 1420 1421 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); 1422 } 1423 1424 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) 1425 { 1426 SWR_CONTEXT *pContext = pDC->pContext; 1427 1428 #if KNOB_ENABLE_TOSS_POINTS 1429 if (KNOB_TOSS_BIN_TRIS) 1430 { 1431 return; 1432 } 1433 #endif 1434 1435 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; 1436 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; 1437 1438 // map x,y relative offsets from start of raster tile to bit position in 1439 // coverage mask for the point 1440 static const uint32_t coverageMap[8][8] = { 1441 { 0, 1, 4, 5, 8, 9, 12, 13 }, 1442 { 2, 3, 6, 7, 10, 11, 14, 15 }, 1443 { 16, 17, 20, 21, 24, 25, 28, 29 }, 1444 { 18, 19, 22, 23, 26, 27, 30, 31 }, 1445 { 32, 33, 36, 37, 40, 41, 44, 45 }, 1446 { 34, 35, 38, 39, 42, 43, 46, 47 }, 1447 { 48, 49, 52, 53, 56, 57, 60, 61 }, 1448 { 50, 51, 54, 55, 58, 59, 62, 63 } 1449 }; 1450 1451 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; 1452 1453 // pull point information from triangle buffer 1454 // @todo use structs for readability 1455 uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; 1456 uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); 1457 float z = *(workDesc.pTriBuffer + 2); 1458 1459 // construct triangle descriptor for point 1460 // no interpolation, set up i,j for constant interpolation of z and attribs 1461 // @todo implement an optimized backend that doesn't require 
triangle information 1462 1463 // compute coverage mask from x,y packed into the coverageMask flag 1464 // mask indices by the maximum valid index for x/y of coveragemap. 1465 uint32_t tX = workDesc.triFlags.coverageMask & 0x7; 1466 uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; 1467 // todo: multisample points? 1468 triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; 1469 1470 // no persp divide needed for points 1471 triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; 1472 triDesc.triFlags = workDesc.triFlags; 1473 triDesc.recipDet = 1.0f; 1474 triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; 1475 triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; 1476 triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; 1477 triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; 1478 1479 RenderOutputBuffers renderBuffers; 1480 GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 1481 renderBuffers, triDesc.triFlags.renderTargetArrayIndex); 1482 1483 AR_BEGIN(BEPixelBackend, pDC->drawId); 1484 backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); 1485 AR_END(BEPixelBackend, 0); 1486 } 1487 1488 // Get pointers to hot tile memory for color RT, depth, stencil 1489 template <uint32_t numSamples> 1490 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex) 1491 { 1492 const API_STATE& state = GetApiState(pDC); 1493 SWR_CONTEXT *pContext = pDC->pContext; 1494 1495 uint32_t mx, my; 1496 MacroTileMgr::getTileIndices(macroID, mx, my); 1497 tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; 1498 tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; 1499 1500 // compute tile offset for active hottile buffers 1501 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; 1502 uint32_t offset = 
ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); 1503 offset*=numSamples; 1504 1505 unsigned long rtSlot = 0; 1506 uint32_t colorHottileEnableMask = state.colorHottileEnable; 1507 while(_BitScanForward(&rtSlot, colorHottileEnableMask)) 1508 { 1509 HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 1510 numSamples, renderTargetArrayIndex); 1511 pColor->state = HOTTILE_DIRTY; 1512 renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset; 1513 1514 colorHottileEnableMask &= ~(1 << rtSlot); 1515 } 1516 if(state.depthHottileEnable) 1517 { 1518 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; 1519 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); 1520 offset*=numSamples; 1521 HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, 1522 numSamples, renderTargetArrayIndex); 1523 pDepth->state = HOTTILE_DIRTY; 1524 SWR_ASSERT(pDepth->pBuffer != nullptr); 1525 renderBuffers.pDepth = pDepth->pBuffer + offset; 1526 } 1527 if(state.stencilHottileEnable) 1528 { 1529 const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; 1530 uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); 1531 offset*=numSamples; 1532 HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, 1533 numSamples, renderTargetArrayIndex); 1534 pStencil->state = HOTTILE_DIRTY; 1535 SWR_ASSERT(pStencil->pBuffer != nullptr); 1536 renderBuffers.pStencil = pStencil->pBuffer + offset; 1537 } 1538 } 1539 1540 template <typename RT> 1541 INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers 
&buffers) 1542 { 1543 for(uint32_t rt = 0; rt < NumRT; ++rt) 1544 { 1545 buffers.pColor[rt] += RT::colorRasterTileStep; 1546 } 1547 1548 buffers.pDepth += RT::depthRasterTileStep; 1549 buffers.pStencil += RT::stencilRasterTileStep; 1550 } 1551 1552 template <typename RT> 1553 INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow) 1554 { 1555 for(uint32_t rt = 0; rt < NumRT; ++rt) 1556 { 1557 startBufferRow.pColor[rt] += RT::colorRasterTileRowStep; 1558 buffers.pColor[rt] = startBufferRow.pColor[rt]; 1559 } 1560 startBufferRow.pDepth += RT::depthRasterTileRowStep; 1561 buffers.pDepth = startBufferRow.pDepth; 1562 1563 startBufferRow.pStencil += RT::stencilRasterTileRowStep; 1564 buffers.pStencil = startBufferRow.pStencil; 1565 } 1566 1567 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) 1568 { 1569 SWR_CONTEXT *pContext = pDC->pContext; 1570 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); 1571 #if KNOB_ENABLE_TOSS_POINTS 1572 if (KNOB_TOSS_BIN_TRIS) 1573 { 1574 return; 1575 } 1576 #endif 1577 1578 // bloat line to two tris and call the triangle rasterizer twice 1579 AR_BEGIN(BERasterizeLine, pDC->drawId); 1580 1581 const API_STATE &state = GetApiState(pDC); 1582 const SWR_RASTSTATE &rastState = state.rastState; 1583 1584 // macrotile dimensioning 1585 uint32_t macroX, macroY; 1586 MacroTileMgr::getTileIndices(macroTile, macroX, macroY); 1587 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; 1588 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; 1589 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; 1590 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; 1591 1592 const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; 1593 1594 // create a copy of the triangle buffer to write our adjusted vertices to 1595 OSALIGNSIMD(float) newTriBuffer[4 * 4]; 1596 
TRIANGLE_WORK_DESC newWorkDesc = workDesc; 1597 newWorkDesc.pTriBuffer = &newTriBuffer[0]; 1598 1599 // create a copy of the attrib buffer to write our adjusted attribs to 1600 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; 1601 newWorkDesc.pAttribs = &newAttribBuffer[0]; 1602 1603 const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); 1604 const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); 1605 1606 __m128 vX, vY, vZ, vRecipW; 1607 1608 vX = _mm_load_ps(workDesc.pTriBuffer); 1609 vY = _mm_load_ps(workDesc.pTriBuffer + 4); 1610 vZ = _mm_load_ps(workDesc.pTriBuffer + 8); 1611 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); 1612 1613 // triangle 0 1614 // v0,v1 -> v0,v0,v1 1615 __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); 1616 __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); 1617 __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); 1618 __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); 1619 1620 __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); 1621 __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); 1622 if (workDesc.triFlags.yMajor) 1623 { 1624 vXa = _mm_add_ps(vAdjust, vXa); 1625 } 1626 else 1627 { 1628 vYa = _mm_add_ps(vAdjust, vYa); 1629 } 1630 1631 // Store triangle description for rasterizer 1632 _mm_store_ps((float*)&newTriBuffer[0], vXa); 1633 _mm_store_ps((float*)&newTriBuffer[4], vYa); 1634 _mm_store_ps((float*)&newTriBuffer[8], vZa); 1635 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); 1636 1637 // binner bins 3 edges for lines as v0, v1, v1 1638 // tri0 needs v0, v0, v1 1639 for (uint32_t a = 0; a < workDesc.numAttribs; ++a) 1640 { 1641 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]); 1642 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]); 1643 1644 _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0); 1645 _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0); 1646 
_mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1); 1647 } 1648 1649 // Store user clip distances for triangle 0 1650 float newClipBuffer[3 * 8]; 1651 uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask); 1652 if (numClipDist) 1653 { 1654 newWorkDesc.pUserClipBuffer = newClipBuffer; 1655 1656 float* pOldBuffer = workDesc.pUserClipBuffer; 1657 float* pNewBuffer = newClipBuffer; 1658 for (uint32_t i = 0; i < numClipDist; ++i) 1659 { 1660 // read barycentric coeffs from binner 1661 float a = *(pOldBuffer++); 1662 float b = *(pOldBuffer++); 1663 1664 // reconstruct original clip distance at vertices 1665 float c0 = a + b; 1666 float c1 = b; 1667 1668 // construct triangle barycentrics 1669 *(pNewBuffer++) = c0 - c1; 1670 *(pNewBuffer++) = c0 - c1; 1671 *(pNewBuffer++) = c1; 1672 } 1673 } 1674 1675 // setup triangle rasterizer function 1676 PFN_WORK_FUNC pfnTriRast; 1677 uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; 1678 // conservative rast not supported for points/lines 1679 pfnTriRast = GetRasterizerFunc(sampleCount, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false)); 1680 1681 // make sure this macrotile intersects the triangle 1682 __m128i vXai = fpToFixedPoint(vXa); 1683 __m128i vYai = fpToFixedPoint(vYa); 1684 OSALIGNSIMD(SWR_RECT) bboxA; 1685 calcBoundingBoxInt(vXai, vYai, bboxA); 1686 1687 if (!(bboxA.xmin > macroBoxRight || 1688 bboxA.xmin > scissorInFixedPoint.xmax || 1689 bboxA.xmax - 1 < macroBoxLeft || 1690 bboxA.xmax - 1 < scissorInFixedPoint.xmin || 1691 bboxA.ymin > macroBoxBottom || 1692 bboxA.ymin > scissorInFixedPoint.ymax || 1693 bboxA.ymax - 1 < macroBoxTop || 1694 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { 1695 // rasterize triangle 1696 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); 1697 } 1698 1699 // triangle 1 1700 // v0,v1 -> v1,v1,v0 1701 vXa = _mm_shuffle_ps(vX, vX, 
_MM_SHUFFLE(1, 0, 1, 1)); 1702 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); 1703 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); 1704 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); 1705 1706 vAdjust = _mm_mul_ps(vLineWidth, vBloat1); 1707 if (workDesc.triFlags.yMajor) 1708 { 1709 vXa = _mm_add_ps(vAdjust, vXa); 1710 } 1711 else 1712 { 1713 vYa = _mm_add_ps(vAdjust, vYa); 1714 } 1715 1716 // Store triangle description for rasterizer 1717 _mm_store_ps((float*)&newTriBuffer[0], vXa); 1718 _mm_store_ps((float*)&newTriBuffer[4], vYa); 1719 _mm_store_ps((float*)&newTriBuffer[8], vZa); 1720 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); 1721 1722 // binner bins 3 edges for lines as v0, v1, v1 1723 // tri1 needs v1, v1, v0 1724 for (uint32_t a = 0; a < workDesc.numAttribs; ++a) 1725 { 1726 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); 1727 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); 1728 1729 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); 1730 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); 1731 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); 1732 } 1733 1734 // store user clip distance for triangle 1 1735 if (numClipDist) 1736 { 1737 float* pOldBuffer = workDesc.pUserClipBuffer; 1738 float* pNewBuffer = newClipBuffer; 1739 for (uint32_t i = 0; i < numClipDist; ++i) 1740 { 1741 // read barycentric coeffs from binner 1742 float a = *(pOldBuffer++); 1743 float b = *(pOldBuffer++); 1744 1745 // reconstruct original clip distance at vertices 1746 float c0 = a + b; 1747 float c1 = b; 1748 1749 // construct triangle barycentrics 1750 *(pNewBuffer++) = c1 - c0; 1751 *(pNewBuffer++) = c1 - c0; 1752 *(pNewBuffer++) = c0; 1753 } 1754 } 1755 1756 vXai = fpToFixedPoint(vXa); 1757 vYai = fpToFixedPoint(vYa); 1758 calcBoundingBoxInt(vXai, vYai, bboxA); 1759 1760 if (!(bboxA.xmin > macroBoxRight || 1761 bboxA.xmin > scissorInFixedPoint.xmax || 1762 bboxA.xmax - 1 < 
macroBoxLeft || 1763 bboxA.xmax - 1 < scissorInFixedPoint.xmin || 1764 bboxA.ymin > macroBoxBottom || 1765 bboxA.ymin > scissorInFixedPoint.ymax || 1766 bboxA.ymax - 1 < macroBoxTop || 1767 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { 1768 // rasterize triangle 1769 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); 1770 } 1771 1772 AR_END(BERasterizeLine, 1); 1773 } 1774 1775 struct RasterizerChooser 1776 { 1777 typedef PFN_WORK_FUNC FuncType; 1778 1779 template <typename... ArgsB> 1780 static FuncType GetFunc() 1781 { 1782 return RasterizeTriangle<RasterizerTraits<ArgsB...>>; 1783 } 1784 }; 1785 1786 // Selector for correct templated RasterizeTriangle function 1787 PFN_WORK_FUNC GetRasterizerFunc( 1788 uint32_t numSamples, 1789 bool IsConservative, 1790 uint32_t InputCoverage, 1791 uint32_t EdgeEnable, 1792 bool RasterizeScissorEdges 1793 ) 1794 { 1795 return TemplateArgUnroller<RasterizerChooser>::GetFunc( 1796 IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples}, 1797 IsConservative, 1798 IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage}, 1799 IntArg<0, VALID_TRI_EDGE_COUNT-1>{EdgeEnable}, 1800 RasterizeScissorEdges); 1801 } 1802