1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 * 23 * @file binner.cpp 24 * 25 * @brief Implementation for the macrotile binner 26 * 27 ******************************************************************************/ 28 29 #include "context.h" 30 #include "frontend.h" 31 #include "conservativeRast.h" 32 #include "pa.h" 33 #include "rasterizer.h" 34 #include "rdtsc_core.h" 35 #include "tilemgr.h" 36 37 // Function Prototype 38 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); 39 40 ////////////////////////////////////////////////////////////////////////// 41 /// @brief Offsets added to post-viewport vertex positions based on 42 /// raster state. 43 static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = 44 { 45 _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER 46 _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL 47 }; 48 49 ////////////////////////////////////////////////////////////////////////// 50 /// @brief Convert the X,Y coords of a triangle to the requested Fixed 51 /// Point precision from FP32. 52 template <typename PT = FixedPointTraits<Fixed_16_8>> 53 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn) 54 { 55 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value)); 56 return _simd_cvtps_epi32(vFixed); 57 } 58 59 ////////////////////////////////////////////////////////////////////////// 60 /// @brief Helper function to set the X,Y coords of a triangle to the 61 /// requested Fixed Point precision from FP32. 
62 /// @param tri: simdvector[3] of FP triangle verts 63 /// @param vXi: fixed point X coords of tri verts 64 /// @param vYi: fixed point Y coords of tri verts 65 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3]) 66 { 67 vXi[0] = fpToFixedPointVertical(tri[0].x); 68 vYi[0] = fpToFixedPointVertical(tri[0].y); 69 vXi[1] = fpToFixedPointVertical(tri[1].x); 70 vYi[1] = fpToFixedPointVertical(tri[1].y); 71 vXi[2] = fpToFixedPointVertical(tri[2].x); 72 vYi[2] = fpToFixedPointVertical(tri[2].y); 73 } 74 75 ////////////////////////////////////////////////////////////////////////// 76 /// @brief Calculate bounding box for current triangle 77 /// @tparam CT: ConservativeRastFETraits type 78 /// @param vX: fixed point X position for triangle verts 79 /// @param vY: fixed point Y position for triangle verts 80 /// @param bbox: fixed point bbox 81 /// *Note*: expects vX, vY to be in the correct precision for the type 82 /// of rasterization. This avoids unnecessary FP->fixed conversions. 
83 template <typename CT> 84 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) 85 { 86 simdscalari vMinX = vX[0]; 87 vMinX = _simd_min_epi32(vMinX, vX[1]); 88 vMinX = _simd_min_epi32(vMinX, vX[2]); 89 90 simdscalari vMaxX = vX[0]; 91 vMaxX = _simd_max_epi32(vMaxX, vX[1]); 92 vMaxX = _simd_max_epi32(vMaxX, vX[2]); 93 94 simdscalari vMinY = vY[0]; 95 vMinY = _simd_min_epi32(vMinY, vY[1]); 96 vMinY = _simd_min_epi32(vMinY, vY[2]); 97 98 simdscalari vMaxY = vY[0]; 99 vMaxY = _simd_max_epi32(vMaxY, vY[1]); 100 vMaxY = _simd_max_epi32(vMaxY, vY[2]); 101 102 bbox.xmin = vMinX; 103 bbox.xmax = vMaxX; 104 bbox.ymin = vMinY; 105 bbox.ymax = vMaxY; 106 } 107 108 ////////////////////////////////////////////////////////////////////////// 109 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical 110 /// Offsets BBox for conservative rast 111 template <> 112 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) 113 { 114 // FE conservative rast traits 115 typedef FEConservativeRastT CT; 116 117 simdscalari vMinX = vX[0]; 118 vMinX = _simd_min_epi32(vMinX, vX[1]); 119 vMinX = _simd_min_epi32(vMinX, vX[2]); 120 121 simdscalari vMaxX = vX[0]; 122 vMaxX = _simd_max_epi32(vMaxX, vX[1]); 123 vMaxX = _simd_max_epi32(vMaxX, vX[2]); 124 125 simdscalari vMinY = vY[0]; 126 vMinY = _simd_min_epi32(vMinY, vY[1]); 127 vMinY = _simd_min_epi32(vMinY, vY[2]); 128 129 simdscalari vMaxY = vY[0]; 130 vMaxY = _simd_max_epi32(vMaxY, vY[1]); 131 vMaxY = _simd_max_epi32(vMaxY, vY[2]); 132 133 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization 134 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. 
135 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); 136 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); 137 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); 138 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); 139 } 140 141 ////////////////////////////////////////////////////////////////////////// 142 /// @brief Processes attributes for the backend based on linkage mask and 143 /// linkage map. Essentially just doing an SOA->AOS conversion and pack. 144 /// @param pDC - Draw context 145 /// @param pa - Primitive Assembly state 146 /// @param linkageMask - Specifies which VS outputs are routed to PS. 147 /// @param pLinkageMap - maps VS attribute slot to PS slot 148 /// @param triIndex - Triangle to process attributes for 149 /// @param pBuffer - Output result 150 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate> 151 INLINE void ProcessAttributes( 152 DRAW_CONTEXT *pDC, 153 PA_STATE&pa, 154 uint32_t triIndex, 155 uint32_t primId, 156 float *pBuffer) 157 { 158 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT"); 159 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; 160 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation 161 LONG constantInterpMask = IsDegenerate::value ? 
0xFFFFFFFF : backendState.constantInterpolationMask; 162 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; 163 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology; 164 165 static const float constTable[3][4] = { 166 { 0.0f, 0.0f, 0.0f, 0.0f }, 167 { 0.0f, 0.0f, 0.0f, 1.0f }, 168 { 1.0f, 1.0f, 1.0f, 1.0f } 169 }; 170 171 for (uint32_t i = 0; i < backendState.numAttributes; ++i) 172 { 173 uint32_t inputSlot; 174 if (IsSwizzledT::value) 175 { 176 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i]; 177 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib; 178 179 } 180 else 181 { 182 inputSlot = VERTEX_ATTRIB_START_SLOT + i; 183 } 184 185 __m128 attrib[3]; // triangle attribs (always 4 wide) 186 float* pAttribStart = pBuffer; 187 188 if (HasConstantInterpT::value || IsDegenerate::value) 189 { 190 if (_bittest(&constantInterpMask, i)) 191 { 192 uint32_t vid; 193 uint32_t adjustedTriIndex; 194 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 }; 195 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } }; 196 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } }; 197 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } }; 198 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } }; 199 200 switch (topo) { 201 case TOP_QUAD_LIST: 202 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex]; 203 vid = quadProvokingVertex[triIndex & 1][provokingVertex]; 204 break; 205 case TOP_QUAD_STRIP: 206 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex]; 207 vid = qstripProvokingVertex[triIndex & 1][provokingVertex]; 208 break; 209 case TOP_TRIANGLE_STRIP: 210 adjustedTriIndex = triIndex; 211 vid = (triIndex & 1) 212 ? 
tristripProvokingVertex[provokingVertex] 213 : provokingVertex; 214 break; 215 default: 216 adjustedTriIndex = triIndex; 217 vid = provokingVertex; 218 break; 219 } 220 221 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib); 222 223 for (uint32_t i = 0; i < NumVertsT::value; ++i) 224 { 225 _mm_store_ps(pBuffer, attrib[vid]); 226 pBuffer += 4; 227 } 228 } 229 else 230 { 231 pa.AssembleSingle(inputSlot, triIndex, attrib); 232 233 for (uint32_t i = 0; i < NumVertsT::value; ++i) 234 { 235 _mm_store_ps(pBuffer, attrib[i]); 236 pBuffer += 4; 237 } 238 } 239 } 240 else 241 { 242 pa.AssembleSingle(inputSlot, triIndex, attrib); 243 244 for (uint32_t i = 0; i < NumVertsT::value; ++i) 245 { 246 _mm_store_ps(pBuffer, attrib[i]); 247 pBuffer += 4; 248 } 249 } 250 251 // pad out the attrib buffer to 3 verts to ensure the triangle 252 // interpolation code in the pixel shader works correctly for the 253 // 3 topologies - point, line, tri. This effectively zeros out the 254 // effect of the missing vertices in the triangle interpolation. 
255 for (uint32_t v = NumVertsT::value; v < 3; ++v) 256 { 257 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]); 258 pBuffer += 4; 259 } 260 261 // check for constant source overrides 262 if (IsSwizzledT::value) 263 { 264 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask; 265 if (mask) 266 { 267 DWORD comp; 268 while (_BitScanForward(&comp, mask)) 269 { 270 mask &= ~(1 << comp); 271 272 float constantValue = 0.0f; 273 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource) 274 { 275 case SWR_CONSTANT_SOURCE_CONST_0000: 276 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT: 277 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT: 278 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp]; 279 break; 280 case SWR_CONSTANT_SOURCE_PRIM_ID: 281 constantValue = *(float*)&primId; 282 break; 283 } 284 285 // apply constant value to all 3 vertices 286 for (uint32_t v = 0; v < 3; ++v) 287 { 288 pAttribStart[comp + v * 4] = constantValue; 289 } 290 } 291 } 292 } 293 } 294 } 295 296 ////////////////////////////////////////////////////////////////////////// 297 /// @brief Gather scissor rect data based on per-prim viewport indices. 298 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. 299 /// @param pViewportIndex - array of per-primitive vewport indexes. 300 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data. 301 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data. 302 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data. 303 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. 304 // 305 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. 
306 template<size_t SimdWidth> 307 struct GatherScissors 308 { 309 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, 310 simdscalari &scisXmin, simdscalari &scisYmin, 311 simdscalari &scisXmax, simdscalari &scisYmax) 312 { 313 SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather"); 314 } 315 }; 316 317 template<> 318 struct GatherScissors<8> 319 { 320 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, 321 simdscalari &scisXmin, simdscalari &scisYmin, 322 simdscalari &scisXmax, simdscalari &scisYmax) 323 { 324 scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, 325 pScissorsInFixedPoint[pViewportIndex[1]].xmin, 326 pScissorsInFixedPoint[pViewportIndex[2]].xmin, 327 pScissorsInFixedPoint[pViewportIndex[3]].xmin, 328 pScissorsInFixedPoint[pViewportIndex[4]].xmin, 329 pScissorsInFixedPoint[pViewportIndex[5]].xmin, 330 pScissorsInFixedPoint[pViewportIndex[6]].xmin, 331 pScissorsInFixedPoint[pViewportIndex[7]].xmin); 332 scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, 333 pScissorsInFixedPoint[pViewportIndex[1]].ymin, 334 pScissorsInFixedPoint[pViewportIndex[2]].ymin, 335 pScissorsInFixedPoint[pViewportIndex[3]].ymin, 336 pScissorsInFixedPoint[pViewportIndex[4]].ymin, 337 pScissorsInFixedPoint[pViewportIndex[5]].ymin, 338 pScissorsInFixedPoint[pViewportIndex[6]].ymin, 339 pScissorsInFixedPoint[pViewportIndex[7]].ymin); 340 scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, 341 pScissorsInFixedPoint[pViewportIndex[1]].xmax, 342 pScissorsInFixedPoint[pViewportIndex[2]].xmax, 343 pScissorsInFixedPoint[pViewportIndex[3]].xmax, 344 pScissorsInFixedPoint[pViewportIndex[4]].xmax, 345 pScissorsInFixedPoint[pViewportIndex[5]].xmax, 346 pScissorsInFixedPoint[pViewportIndex[6]].xmax, 347 pScissorsInFixedPoint[pViewportIndex[7]].xmax); 348 scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, 349 
pScissorsInFixedPoint[pViewportIndex[1]].ymax, 350 pScissorsInFixedPoint[pViewportIndex[2]].ymax, 351 pScissorsInFixedPoint[pViewportIndex[3]].ymax, 352 pScissorsInFixedPoint[pViewportIndex[4]].ymax, 353 pScissorsInFixedPoint[pViewportIndex[5]].ymax, 354 pScissorsInFixedPoint[pViewportIndex[6]].ymax, 355 pScissorsInFixedPoint[pViewportIndex[7]].ymax); 356 } 357 }; 358 359 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); 360 361 struct ProcessAttributesChooser 362 { 363 typedef PFN_PROCESS_ATTRIBUTES FuncType; 364 365 template <typename... ArgsB> 366 static FuncType GetFunc() 367 { 368 return ProcessAttributes<ArgsB...>; 369 } 370 }; 371 372 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false) 373 { 374 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate); 375 } 376 377 ////////////////////////////////////////////////////////////////////////// 378 /// @brief Processes enabled user clip distances. Loads the active clip 379 /// distances from the PA, sets up barycentric equations, and 380 /// stores the results to the output buffer 381 /// @param pa - Primitive Assembly state 382 /// @param primIndex - primitive index to process 383 /// @param clipDistMask - mask of enabled clip distances 384 /// @param pUserClipBuffer - buffer to store results 385 template<uint32_t NumVerts> 386 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer) 387 { 388 DWORD clipDist; 389 while (_BitScanForward(&clipDist, clipDistMask)) 390 { 391 clipDistMask &= ~(1 << clipDist); 392 uint32_t clipSlot = clipDist >> 2; 393 uint32_t clipComp = clipDist & 0x3; 394 uint32_t clipAttribSlot = clipSlot == 0 ? 
395 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; 396 397 __m128 primClipDist[3]; 398 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); 399 400 float vertClipDist[NumVerts]; 401 for (uint32_t e = 0; e < NumVerts; ++e) 402 { 403 OSALIGNSIMD(float) aVertClipDist[4]; 404 _mm_store_ps(aVertClipDist, primClipDist[e]); 405 vertClipDist[e] = aVertClipDist[clipComp]; 406 }; 407 408 // setup plane equations for barycentric interpolation in the backend 409 float baryCoeff[NumVerts]; 410 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1]; 411 for (uint32_t e = 0; e < NumVerts - 1; ++e) 412 { 413 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last; 414 } 415 baryCoeff[NumVerts - 1] = last; 416 417 for (uint32_t e = 0; e < NumVerts; ++e) 418 { 419 *(pUserClipBuffer++) = baryCoeff[e]; 420 } 421 } 422 } 423 424 ////////////////////////////////////////////////////////////////////////// 425 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping 426 /// culling, viewport transform, etc. 427 /// @param pDC - pointer to draw context. 428 /// @param pa - The primitive assembly object. 429 /// @param workerId - thread's worker id. Even thread has a unique id. 430 /// @param tri - Contains triangle position data for SIMDs worth of triangles. 431 /// @param primID - Primitive ID for each triangle. 432 /// @param viewportIdx - viewport array index for each triangle. 
/// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            // per-primitive viewport selection
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant
    // vDet[0] / vDet[1] are tested as 64-bit elements below and feed the
    // low / high half of the per-lane masks respectively
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    // one bit per triangle: set when the determinant is exactly zero
    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // CW  +det
    // CCW det < 0;
    // 0 area triangles are marked as backfacing regardless of winding order,
    // which is required behavior for conservative rast and wireframe rendering
    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
        maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    }
    else
    {
        maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
        maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
    }
    frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
        // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    // report how many triangles the zero-area and facing culls removed
    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    // Simple non-conformant wireframe mode, useful for debugging
    if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
    {
        // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
        simdvector line[2];
        simdscalar recipW[2];
        // edge v0 -> v1
        line[0] = tri[0];
        line[1] = tri[1];
        recipW[0] = vRecipW0;
        recipW[1] = vRecipW1;
        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        // edge v1 -> v2
        line[0] = tri[1];
        line[1] = tri[2];
        recipW[0] = vRecipW1;
        recipW[1] = vRecipW2;
        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        // edge v2 -> v0
        line[0] = tri[2];
        line[1] = tri[0];
        recipW[0] = vRecipW2;
        recipW[1] = vRecipW0;
        BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);

        AR_END(FEBinTriangles, 1);
        return;
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    // scalar views of the per-lane primitive IDs and viewport indices
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;
    // for center sample pattern, all samples are at pixel center; calculate coverage
    // once at center and broadcast the results in the backend
    const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    // edgeEnable is only assigned (and only read) on the conservative path,
    // pfnWork only on the non-conservative path
    uint32_t edgeEnable;
    PFN_WORK_FUNC pfnWork;
    if (CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if (cullZeroAreaMask > 0)
        {
            // e0 = v1-v0
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2-v1
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0-v2
            // if v0 == v1 & v1 == v2, v0 == v2
            // NOTE(review): as written, e2 is only flagged when all three
            // vertices coincide; v0 == v2 alone is not tested -- confirm intended.
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // Spread each per-triangle bit to every third bit so that triangle N
            // owns bits [3N, 3N+2] of the combined mask.
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // a set bit means the edge is valid; collapsed edges are cleared
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
            (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
            (state.scissorsTileAligned == false));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if (rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
            xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
            simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
            xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

            simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
            ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
            simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
            ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
            // cull when the rounded extents match in either direction
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if (origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

    if (CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // spill the macrotile extents so they can be indexed per triangle below
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        // the index is taken from the x component of vertex 0
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // Both early-outs above jump here only when triMask is zero, so the loop
    // below does not execute and the arrays skipped by the goto are never read.
endBinTriangles:

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if (CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
                (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        // (3 vertices x 4 floats for each attribute)
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data
        // layout: 4 floats each of X, Y, Z and 1/W, in that order
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            // pTriBuffer[12..] holds the 1/W values stored just above
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
        }

        // enqueue the work item on every macrotile the bbox touches
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        // done with this triangle; clear its bit so the scan advances
        triMask &= ~(1 << triIndex);
    }

    AR_END(FEBinTriangles, 1);
}

// Adapter used with TemplateArgUnroller to obtain the BinTriangles
// instantiation for a given set of ConservativeRastFETraits arguments.
struct FEBinTrianglesChooser
{
    typedef PFN_PROCESS_PRIMS FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
    }
};

// Selector for correct templated BinTriangles function
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
}


//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend.  Only supports point size of 1
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primID - Primitive ID for each point.
884 void BinPoints( 885 DRAW_CONTEXT *pDC, 886 PA_STATE& pa, 887 uint32_t workerId, 888 simdvector prim[3], 889 uint32_t primMask, 890 simdscalari primID, 891 simdscalari viewportIdx) 892 { 893 SWR_CONTEXT *pContext = pDC->pContext; 894 895 AR_BEGIN(FEBinPoints, pDC->drawId); 896 897 simdvector& primVerts = prim[0]; 898 899 const API_STATE& state = GetApiState(pDC); 900 const SWR_FRONTEND_STATE& feState = state.frontendState; 901 const SWR_GS_STATE& gsState = state.gsState; 902 const SWR_RASTSTATE& rastState = state.rastState; 903 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; 904 905 // Select attribute processor 906 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, 907 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); 908 909 if (!feState.vpTransformDisable) 910 { 911 // perspective divide 912 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); 913 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); 914 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); 915 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); 916 917 // viewport transform to screen coords 918 if (state.gsState.emitsViewportArrayIndex) 919 { 920 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); 921 } 922 else 923 { 924 viewportTransform<1>(&primVerts, state.vpMatrices); 925 } 926 } 927 928 // adjust for pixel center location 929 simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; 930 primVerts.x = _simd_add_ps(primVerts.x, offset); 931 primVerts.y = _simd_add_ps(primVerts.y, offset); 932 933 // convert to fixed point 934 simdscalari vXi, vYi; 935 vXi = fpToFixedPointVertical(primVerts.x); 936 vYi = fpToFixedPointVertical(primVerts.y); 937 938 if (CanUseSimplePoints(pDC)) 939 { 940 // adjust for ymin-xmin rule 941 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); 942 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); 943 944 // cull points off the ymin-xmin edge of the viewport 945 primMask &= 
~_simd_movemask_ps(_simd_castsi_ps(vXi)); 946 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); 947 948 // compute macro tile coordinates 949 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); 950 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); 951 952 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; 953 _simd_store_si((simdscalari*)aMacroX, macroX); 954 _simd_store_si((simdscalari*)aMacroY, macroY); 955 956 // compute raster tile coordinates 957 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); 958 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); 959 960 // compute raster tile relative x,y for coverage mask 961 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); 962 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); 963 964 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); 965 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); 966 967 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; 968 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; 969 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); 970 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); 971 972 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; 973 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; 974 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); 975 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); 976 977 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; 978 _simd_store_ps((float*)aZ, primVerts.z); 979 980 // store render target array index 981 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; 982 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) 983 { 984 simdvector vRtai; 985 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); 986 simdscalari vRtaii = 
_simd_castps_si(vRtai.x); 987 _simd_store_si((simdscalari*)aRTAI, vRtaii); 988 } 989 else 990 { 991 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); 992 } 993 994 uint32_t *pPrimID = (uint32_t *)&primID; 995 DWORD primIndex = 0; 996 997 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; 998 999 // scan remaining valid triangles and bin each separately 1000 while (_BitScanForward(&primIndex, primMask)) 1001 { 1002 uint32_t linkageCount = backendState.numAttributes; 1003 uint32_t numScalarAttribs = linkageCount * 4; 1004 1005 BE_WORK work; 1006 work.type = DRAW; 1007 1008 TRIANGLE_WORK_DESC &desc = work.desc.tri; 1009 1010 // points are always front facing 1011 desc.triFlags.frontFacing = 1; 1012 desc.triFlags.primID = pPrimID[primIndex]; 1013 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; 1014 desc.triFlags.viewportIndex = pViewportIndex[primIndex]; 1015 1016 work.pfnWork = RasterizeSimplePoint; 1017 1018 auto pArena = pDC->pArena; 1019 SWR_ASSERT(pArena != nullptr); 1020 1021 // store attributes 1022 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); 1023 desc.pAttribs = pAttribs; 1024 desc.numAttribs = linkageCount; 1025 1026 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); 1027 1028 // store raster tile aligned x, y, perspective correct z 1029 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); 1030 desc.pTriBuffer = pTriBuffer; 1031 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; 1032 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; 1033 *pTriBuffer = aZ[primIndex]; 1034 1035 uint32_t tX = aTileRelativeX[primIndex]; 1036 uint32_t tY = aTileRelativeY[primIndex]; 1037 1038 // pack the relative x,y into the coverageMask, the rasterizer will 1039 // generate the true coverage mask from it 1040 work.desc.tri.triFlags.coverageMask = tX | (tY << 4); 1041 1042 // bin it 1043 MacroTileMgr *pTileMgr = pDC->pTileMgr; 1044 #if 
KNOB_ENABLE_TOSS_POINTS 1045 if (!KNOB_TOSS_SETUP_TRIS) 1046 #endif 1047 { 1048 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); 1049 } 1050 primMask &= ~(1 << primIndex); 1051 } 1052 } 1053 else 1054 { 1055 // non simple points need to be potentially binned to multiple macro tiles 1056 simdscalar vPointSize; 1057 if (rastState.pointParam) 1058 { 1059 simdvector size[3]; 1060 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); 1061 vPointSize = size[0].x; 1062 } 1063 else 1064 { 1065 vPointSize = _simd_set1_ps(rastState.pointSize); 1066 } 1067 1068 // bloat point to bbox 1069 simdBBox bbox; 1070 bbox.xmin = bbox.xmax = vXi; 1071 bbox.ymin = bbox.ymax = vYi; 1072 1073 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); 1074 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); 1075 bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); 1076 bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); 1077 bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); 1078 bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); 1079 1080 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. 1081 // Gather the AOS effective scissor rects based on the per-prim VP index. 1082 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. 1083 simdscalari scisXmin, scisYmin, scisXmax, scisYmax; 1084 if (state.gsState.emitsViewportArrayIndex) 1085 { 1086 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, 1087 scisXmin, scisYmin, scisXmax, scisYmax); 1088 } 1089 else // broadcast fast path for non-VPAI case. 
1090 { 1091 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); 1092 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); 1093 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); 1094 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); 1095 } 1096 1097 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); 1098 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); 1099 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); 1100 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); 1101 1102 // Cull bloated points completely outside scissor 1103 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); 1104 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); 1105 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); 1106 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); 1107 primMask = primMask & ~maskOutsideScissor; 1108 1109 // Convert bbox to macrotile units. 
1110 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); 1111 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); 1112 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); 1113 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); 1114 1115 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; 1116 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); 1117 _simd_store_si((simdscalari*)aMTRight, bbox.xmax); 1118 _simd_store_si((simdscalari*)aMTTop, bbox.ymin); 1119 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); 1120 1121 // store render target array index 1122 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; 1123 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) 1124 { 1125 simdvector vRtai[2]; 1126 pa.Assemble(VERTEX_RTAI_SLOT, vRtai); 1127 simdscalari vRtaii = _simd_castps_si(vRtai[0].x); 1128 _simd_store_si((simdscalari*)aRTAI, vRtaii); 1129 } 1130 else 1131 { 1132 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); 1133 } 1134 1135 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; 1136 _simd_store_ps((float*)aPointSize, vPointSize); 1137 1138 uint32_t *pPrimID = (uint32_t *)&primID; 1139 1140 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; 1141 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; 1142 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; 1143 1144 _simd_store_ps((float*)aPrimVertsX, primVerts.x); 1145 _simd_store_ps((float*)aPrimVertsY, primVerts.y); 1146 _simd_store_ps((float*)aPrimVertsZ, primVerts.z); 1147 1148 // scan remaining valid prims and bin each separately 1149 const SWR_BACKEND_STATE& backendState = state.backendState; 1150 DWORD primIndex; 1151 while (_BitScanForward(&primIndex, primMask)) 1152 { 1153 uint32_t linkageCount = backendState.numAttributes; 1154 uint32_t numScalarAttribs = linkageCount * 4; 1155 1156 BE_WORK work; 1157 work.type = DRAW; 1158 1159 
TRIANGLE_WORK_DESC &desc = work.desc.tri; 1160 1161 desc.triFlags.frontFacing = 1; 1162 desc.triFlags.primID = pPrimID[primIndex]; 1163 desc.triFlags.pointSize = aPointSize[primIndex]; 1164 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; 1165 desc.triFlags.viewportIndex = pViewportIndex[primIndex]; 1166 1167 work.pfnWork = RasterizeTriPoint; 1168 1169 auto pArena = pDC->pArena; 1170 SWR_ASSERT(pArena != nullptr); 1171 1172 // store active attribs 1173 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); 1174 desc.numAttribs = linkageCount; 1175 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); 1176 1177 // store point vertex data 1178 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); 1179 desc.pTriBuffer = pTriBuffer; 1180 *pTriBuffer++ = aPrimVertsX[primIndex]; 1181 *pTriBuffer++ = aPrimVertsY[primIndex]; 1182 *pTriBuffer = aPrimVertsZ[primIndex]; 1183 1184 // store user clip distances 1185 if (rastState.clipDistanceMask) 1186 { 1187 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); 1188 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); 1189 float dists[8]; 1190 float one = 1.0f; 1191 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists); 1192 for (uint32_t i = 0; i < numClipDist; i++) { 1193 desc.pUserClipBuffer[3*i + 0] = 0.0f; 1194 desc.pUserClipBuffer[3*i + 1] = 0.0f; 1195 desc.pUserClipBuffer[3*i + 2] = dists[i]; 1196 } 1197 } 1198 1199 MacroTileMgr *pTileMgr = pDC->pTileMgr; 1200 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) 1201 { 1202 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) 1203 { 1204 #if KNOB_ENABLE_TOSS_POINTS 1205 if (!KNOB_TOSS_SETUP_TRIS) 1206 #endif 1207 { 1208 pTileMgr->enqueue(x, y, &work); 1209 } 1210 } 1211 } 1212 1213 primMask &= ~(1 << primIndex); 1214 } 1215 } 1216 1217 AR_END(FEBinPoints, 1); 1218 } 1219 1220 
////////////////////////////////////////////////////////////////////////// 1221 /// @brief Bin SIMD lines to the backend. 1222 /// @param pDC - pointer to draw context. 1223 /// @param pa - The primitive assembly object. 1224 /// @param workerId - thread's worker id. Even thread has a unique id. 1225 /// @param tri - Contains line position data for SIMDs worth of points. 1226 /// @param primID - Primitive ID for each line. 1227 /// @param viewportIdx - Viewport Array Index for each line. 1228 void BinPostSetupLines( 1229 DRAW_CONTEXT *pDC, 1230 PA_STATE& pa, 1231 uint32_t workerId, 1232 simdvector prim[], 1233 simdscalar recipW[], 1234 uint32_t primMask, 1235 simdscalari primID, 1236 simdscalari viewportIdx) 1237 { 1238 SWR_CONTEXT *pContext = pDC->pContext; 1239 1240 AR_BEGIN(FEBinLines, pDC->drawId); 1241 1242 const API_STATE& state = GetApiState(pDC); 1243 const SWR_RASTSTATE& rastState = state.rastState; 1244 const SWR_FRONTEND_STATE& feState = state.frontendState; 1245 const SWR_GS_STATE& gsState = state.gsState; 1246 1247 // Select attribute processor 1248 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, 1249 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); 1250 1251 simdscalar& vRecipW0 = recipW[0]; 1252 simdscalar& vRecipW1 = recipW[1]; 1253 1254 // convert to fixed point 1255 simdscalari vXi[2], vYi[2]; 1256 vXi[0] = fpToFixedPointVertical(prim[0].x); 1257 vYi[0] = fpToFixedPointVertical(prim[0].y); 1258 vXi[1] = fpToFixedPointVertical(prim[1].x); 1259 vYi[1] = fpToFixedPointVertical(prim[1].y); 1260 1261 // compute x-major vs y-major mask 1262 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); 1263 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); 1264 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); 1265 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); 1266 1267 // cull zero-length lines 1268 simdscalari vZeroLengthMask = 
_simd_cmpeq_epi32(xLength, _simd_setzero_si()); 1269 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); 1270 1271 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); 1272 1273 uint32_t *pPrimID = (uint32_t *)&primID; 1274 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; 1275 1276 simdscalar vUnused = _simd_setzero_ps(); 1277 1278 // Calc bounding box of lines 1279 simdBBox bbox; 1280 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]); 1281 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]); 1282 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]); 1283 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]); 1284 1285 // bloat bbox by line width along minor axis 1286 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); 1287 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); 1288 simdBBox bloatBox; 1289 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); 1290 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); 1291 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); 1292 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); 1293 1294 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); 1295 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); 1296 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); 1297 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); 1298 1299 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. 1300 simdscalari scisXmin, scisYmin, scisXmax, scisYmax; 1301 if (state.gsState.emitsViewportArrayIndex) 1302 { 1303 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, 1304 scisXmin, scisYmin, scisXmax, scisYmax); 1305 } 1306 else // broadcast fast path for non-VPAI case. 
1307 { 1308 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); 1309 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); 1310 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); 1311 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); 1312 } 1313 1314 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); 1315 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); 1316 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); 1317 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); 1318 1319 // Cull prims completely outside scissor 1320 { 1321 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); 1322 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); 1323 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); 1324 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); 1325 primMask = primMask & ~maskOutsideScissor; 1326 } 1327 1328 if (!primMask) 1329 { 1330 goto endBinLines; 1331 } 1332 1333 // Convert triangle bbox to macrotile units. 
1334 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); 1335 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); 1336 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); 1337 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); 1338 1339 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; 1340 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); 1341 _simd_store_si((simdscalari*)aMTRight, bbox.xmax); 1342 _simd_store_si((simdscalari*)aMTTop, bbox.ymin); 1343 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); 1344 1345 // transpose verts needed for backend 1346 /// @todo modify BE to take non-transformed verts 1347 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; 1348 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); 1349 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); 1350 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); 1351 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); 1352 1353 // store render target array index 1354 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; 1355 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) 1356 { 1357 simdvector vRtai[2]; 1358 pa.Assemble(VERTEX_RTAI_SLOT, vRtai); 1359 simdscalari vRtaii = _simd_castps_si(vRtai[0].x); 1360 _simd_store_si((simdscalari*)aRTAI, vRtaii); 1361 } 1362 else 1363 { 1364 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); 1365 } 1366 1367 // scan remaining valid prims and bin each separately 1368 DWORD primIndex; 1369 while (_BitScanForward(&primIndex, primMask)) 1370 { 1371 uint32_t linkageCount = state.backendState.numAttributes; 1372 uint32_t numScalarAttribs = linkageCount * 4; 1373 1374 BE_WORK work; 1375 work.type = DRAW; 1376 1377 TRIANGLE_WORK_DESC &desc = work.desc.tri; 1378 1379 desc.triFlags.frontFacing = 1; 1380 desc.triFlags.primID = pPrimID[primIndex]; 1381 desc.triFlags.yMajor = 
(yMajorMask >> primIndex) & 1; 1382 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; 1383 desc.triFlags.viewportIndex = pViewportIndex[primIndex]; 1384 1385 work.pfnWork = RasterizeLine; 1386 1387 auto pArena = pDC->pArena; 1388 SWR_ASSERT(pArena != nullptr); 1389 1390 // store active attribs 1391 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); 1392 desc.numAttribs = linkageCount; 1393 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); 1394 1395 // store line vertex data 1396 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); 1397 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); 1398 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); 1399 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); 1400 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); 1401 1402 // store user clip distances 1403 if (rastState.clipDistanceMask) 1404 { 1405 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); 1406 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); 1407 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer); 1408 } 1409 1410 MacroTileMgr *pTileMgr = pDC->pTileMgr; 1411 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) 1412 { 1413 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) 1414 { 1415 #if KNOB_ENABLE_TOSS_POINTS 1416 if (!KNOB_TOSS_SETUP_TRIS) 1417 #endif 1418 { 1419 pTileMgr->enqueue(x, y, &work); 1420 } 1421 } 1422 } 1423 1424 primMask &= ~(1 << primIndex); 1425 } 1426 1427 endBinLines: 1428 1429 AR_END(FEBinLines, 1); 1430 } 1431 1432 ////////////////////////////////////////////////////////////////////////// 1433 /// @brief Bin SIMD lines to the backend. 1434 /// @param pDC - pointer to draw context. 1435 /// @param pa - The primitive assembly object. 1436 /// @param workerId - thread's worker id. 
Even thread has a unique id. 1437 /// @param tri - Contains line position data for SIMDs worth of points. 1438 /// @param primID - Primitive ID for each line. 1439 /// @param viewportIdx - Viewport Array Index for each line. 1440 void BinLines( 1441 DRAW_CONTEXT *pDC, 1442 PA_STATE& pa, 1443 uint32_t workerId, 1444 simdvector prim[], 1445 uint32_t primMask, 1446 simdscalari primID, 1447 simdscalari viewportIdx) 1448 { 1449 SWR_CONTEXT *pContext = pDC->pContext; 1450 1451 const API_STATE& state = GetApiState(pDC); 1452 const SWR_RASTSTATE& rastState = state.rastState; 1453 const SWR_FRONTEND_STATE& feState = state.frontendState; 1454 const SWR_GS_STATE& gsState = state.gsState; 1455 1456 // Select attribute processor 1457 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, 1458 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); 1459 1460 simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) }; 1461 1462 if (!feState.vpTransformDisable) 1463 { 1464 // perspective divide 1465 vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); 1466 vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); 1467 1468 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]); 1469 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]); 1470 1471 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]); 1472 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]); 1473 1474 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]); 1475 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]); 1476 1477 // viewport transform to screen coords 1478 if (state.gsState.emitsViewportArrayIndex) 1479 { 1480 viewportTransform<2>(prim, state.vpMatrices, viewportIdx); 1481 } 1482 else 1483 { 1484 viewportTransform<2>(prim, state.vpMatrices); 1485 } 1486 } 1487 1488 // adjust for pixel center location 1489 simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; 1490 prim[0].x = _simd_add_ps(prim[0].x, offset); 1491 prim[0].y = 
_simd_add_ps(prim[0].y, offset); 1492 1493 prim[1].x = _simd_add_ps(prim[1].x, offset); 1494 prim[1].y = _simd_add_ps(prim[1].y, offset); 1495 1496 BinPostSetupLines( 1497 pDC, 1498 pa, 1499 workerId, 1500 prim, 1501 vRecipW, 1502 primMask, 1503 primID, 1504 viewportIdx); 1505 } 1506