1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file clip.h 24 * 25 * @brief Definitions for clipping 26 * 27 ******************************************************************************/ 28 #pragma once 29 30 #include "common/simdintrin.h" 31 #include "core/context.h" 32 #include "core/pa.h" 33 #include "rdtsc_core.h" 34 35 // Temp storage used by the clipper 36 extern THREAD simdvertex tlsTempVertices[7]; 37 38 enum SWR_CLIPCODES 39 { 40 // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. 41 // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes. 42 #define CLIPCODE_SHIFT 23 43 FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), 44 FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), 45 FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), 46 FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), 47 48 FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), 49 FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), 50 51 NEGW = (0x40 << CLIPCODE_SHIFT), 52 53 GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), 54 GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), 55 GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), 56 GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) 57 }; 58 59 #define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR) 60 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) 61 62 void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, 63 int *numVerts, float *pOutAttribs); 64 65 INLINE 66 void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes) 67 { 68 clipCodes = _simd_setzero_ps(); 69 70 // -w 71 simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f)); 72 73 // FRUSTUM_LEFT 74 simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW); 75 clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT))); 76 77 // FRUSTUM_TOP 78 vRes = _simd_cmplt_ps(vertex.y, vNegW); 79 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP)))); 80 81 // FRUSTUM_RIGHT 82 vRes = _simd_cmpgt_ps(vertex.x, vertex.w); 83 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT)))); 84 85 // FRUSTUM_BOTTOM 86 vRes = _simd_cmpgt_ps(vertex.y, vertex.w); 87 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM)))); 88 89 if (state.rastState.depthClipEnable) 90 { 91 // FRUSTUM_NEAR 92 // DX clips depth [0..w], GL clips [-w..w] 93 if (state.rastState.clipHalfZ) 94 { 95 vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps()); 96 } 97 else 98 { 99 vRes = _simd_cmplt_ps(vertex.z, vNegW); 100 } 101 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR)))); 102 103 // FRUSTUM_FAR 104 vRes = _simd_cmpgt_ps(vertex.z, vertex.w); 105 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR)))); 106 } 107 108 // NEGW 109 vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps()); 110 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW)))); 111 112 // GUARDBAND_LEFT 113 simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4)); 114 vRes = _simd_cmplt_ps(vertex.x, gbMult); 115 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT)))); 116 117 // GUARDBAND_TOP 118 gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4)); 119 vRes = _simd_cmplt_ps(vertex.y, gbMult); 120 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP)))); 121 122 // GUARDBAND_RIGHT 123 gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4)); 124 vRes = _simd_cmpgt_ps(vertex.x, gbMult); 125 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT)))); 126 127 // GUARDBAND_BOTTOM 128 gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4)); 129 vRes = _simd_cmpgt_ps(vertex.y, gbMult); 130 clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM)))); 131 } 132 133 template<uint32_t NumVertsPerPrim> 134 class Clipper 135 { 136 public: 137 Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) : 138 workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC)) 139 { 140 static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); 141 } 142 143 void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes) 144 { 145 for (uint32_t i = 0; i < NumVertsPerPrim; ++i) 146 { 147 ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes); 148 } 149 } 150 151 simdscalar ComputeClipCodeIntersection() 152 { 153 simdscalar result = this->clipCodes[0]; 154 for (uint32_t i = 1; i < NumVertsPerPrim; ++i) 155 { 156 result = _simd_and_ps(result, this->clipCodes[i]); 157 } 158 return result; 159 } 160 161 simdscalar ComputeClipCodeUnion() 162 { 163 simdscalar result = this->clipCodes[0]; 164 for (uint32_t i = 1; i < NumVertsPerPrim; ++i) 165 { 166 result = _simd_or_ps(result, this->clipCodes[i]); 167 } 168 return result; 169 } 170 171 int ComputeNegWMask() 172 { 173 simdscalar clipCodeUnion = ComputeClipCodeUnion(); 174 clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW))); 175 return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps())); 176 } 177 178 int ComputeClipMask() 179 { 180 simdscalar clipUnion = ComputeClipCodeUnion(); 181 clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK))); 182 return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps())); 183 } 184 185 // clipper is responsible for culling any prims with NAN coordinates 186 int ComputeNaNMask(simdvector prim[]) 187 { 188 simdscalar vNanMask = _simd_setzero_ps(); 189 for (uint32_t e = 0; e < NumVertsPerPrim; ++e) 190 { 191 simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q); 192 vNanMask = _simd_or_ps(vNanMask, vNan01); 193 simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q); 194 vNanMask = _simd_or_ps(vNanMask, vNan23); 195 } 196 197 return _simd_movemask_ps(vNanMask); 198 } 199 200 int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[]) 201 { 202 uint8_t cullMask = this->state.rastState.cullDistanceMask; 203 simdscalar vClipCullMask = _simd_setzero_ps(); 204 DWORD index; 205 206 simdvector vClipCullDistLo[3]; 207 simdvector vClipCullDistHi[3]; 208 209 pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); 210 pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); 211 while (_BitScanForward(&index, cullMask)) 212 { 213 cullMask &= ~(1 << index); 214 uint32_t slot = index >> 2; 215 uint32_t component = index & 0x3; 216 217 simdscalar vCullMaskElem = _simd_set1_ps(-1.0f); 218 for (uint32_t e = 0; e < NumVertsPerPrim; ++e) 219 { 220 simdscalar vCullComp; 221 if (slot == 0) 222 { 223 vCullComp = vClipCullDistLo[e][component]; 224 } 225 else 226 { 227 vCullComp = vClipCullDistHi[e][component]; 228 } 229 230 // cull if cull distance < 0 || NAN 231 simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ); 232 vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull); 233 } 234 vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem); 235 } 236 237 // clipper should also discard any primitive with NAN clip distance 238 uint8_t clipMask = this->state.rastState.clipDistanceMask; 239 while (_BitScanForward(&index, clipMask)) 240 { 241 clipMask &= ~(1 << index); 242 uint32_t slot = index >> 2; 243 uint32_t component = index & 0x3; 244 245 for (uint32_t e = 0; e < NumVertsPerPrim; ++e) 246 { 247 simdscalar vClipComp; 248 if (slot == 0) 249 { 250 vClipComp = vClipCullDistLo[e][component]; 251 } 252 else 253 { 254 vClipComp = vClipCullDistHi[e][component]; 255 } 256 257 simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q); 258 vClipCullMask = _simd_or_ps(vClipCullMask, vClip); 259 } 260 } 261 262 return _simd_movemask_ps(vClipCullMask); 263 } 264 265 // clip SIMD primitives 266 void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx) 267 { 268 // input/output vertex store for clipper 269 simdvertex vertices[7]; // maximum 7 verts generated per triangle 270 271 LONG constantInterpMask = this->state.backendState.constantInterpolationMask; 272 uint32_t provokingVertex = 0; 273 if(pa.binTopology == TOP_TRIANGLE_FAN) 274 { 275 provokingVertex = this->state.frontendState.provokingVertex.triFan; 276 } 277 ///@todo: line topology for wireframe? 278 279 // assemble pos 280 simdvector tmpVector[NumVertsPerPrim]; 281 pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); 282 for (uint32_t i = 0; i < NumVertsPerPrim; ++i) 283 { 284 vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; 285 } 286 287 // assemble attribs 288 const SWR_BACKEND_STATE& backendState = this->state.backendState; 289 290 int32_t maxSlot = -1; 291 for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot) 292 { 293 // Compute absolute attrib slot in vertex array 294 uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot; 295 maxSlot = std::max<int32_t>(maxSlot, mapSlot); 296 uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot; 297 298 pa.Assemble(inputSlot, tmpVector); 299 300 // if constant interpolation enabled for this attribute, assign the provoking 301 // vertex values to all edges 302 if (_bittest(&constantInterpMask, slot)) 303 { 304 for (uint32_t i = 0; i < NumVertsPerPrim; ++i) 305 { 306 vertices[i].attrib[inputSlot] = tmpVector[provokingVertex]; 307 } 308 } 309 else 310 { 311 for (uint32_t i = 0; i < NumVertsPerPrim; ++i) 312 { 313 vertices[i].attrib[inputSlot] = tmpVector[i]; 314 } 315 } 316 } 317 318 // assemble user clip distances if enabled 319 if (this->state.rastState.clipDistanceMask & 0xf) 320 { 321 pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); 322 for (uint32_t i = 0; i < NumVertsPerPrim; ++i) 323 { 324 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i]; 325 } 326 } 327 328 if (this->state.rastState.clipDistanceMask & 0xf0) 329 { 330 pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); 331 for (uint32_t i = 0; i < NumVertsPerPrim; ++i) 332 { 333 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i]; 334 } 335 } 336 337 uint32_t numAttribs = maxSlot + 1; 338 339 simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); 340 341 // set up new PA for binning clipped primitives 342 PFN_PROCESS_PRIMS pfnBinFunc = nullptr; 343 PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; 344 if (NumVertsPerPrim == 3) 345 { 346 pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0)); 347 clipTopology = TOP_TRIANGLE_FAN; 348 349 // so that the binner knows to bloat wide points later 350 if (pa.binTopology == TOP_POINT_LIST) 351 clipTopology = TOP_POINT_LIST; 352 353 } 354 else if (NumVertsPerPrim == 2) 355 { 356 pfnBinFunc = BinLines; 357 clipTopology = TOP_LINE_LIST; 358 } 359 else 360 { 361 SWR_ASSERT(0 && "Unexpected points in clipper."); 362 } 363 364 uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; 365 uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; 366 uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx; 367 368 const simdscalari vOffsets = _mm256_set_epi32( 369 0 * sizeof(simdvertex), // unused lane 370 6 * sizeof(simdvertex), 371 5 * sizeof(simdvertex), 372 4 * sizeof(simdvertex), 373 3 * sizeof(simdvertex), 374 2 * sizeof(simdvertex), 375 1 * sizeof(simdvertex), 376 0 * sizeof(simdvertex)); 377 378 // only need to gather 7 verts 379 // @todo dynamic mask based on actual # of verts generated per lane 380 const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); 381 382 uint32_t numClippedPrims = 0; 383 for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) 384 { 385 uint32_t numEmittedVerts = pVertexCount[inputPrim]; 386 if (numEmittedVerts < NumVertsPerPrim) 387 { 388 continue; 389 } 390 SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); 391 392 uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); 393 numClippedPrims += numEmittedPrims; 394 395 // tranpose clipper output so that each lane's vertices are in SIMD order 396 // set aside space for 2 vertices, as the PA will try to read up to 16 verts 397 // for triangle fan 398 simdvertex transposedPrims[2]; 399 400 // transpose pos 401 uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; 402 for (uint32_t c = 0; c < 4; ++c) 403 { 404 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); 405 pBase += sizeof(simdscalar); 406 } 407 408 // transpose attribs 409 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim; 410 for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) 411 { 412 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; 413 for (uint32_t c = 0; c < 4; ++c) 414 { 415 transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); 416 pBase += sizeof(simdscalar); 417 } 418 } 419 420 // transpose user clip distances if enabled 421 if (this->state.rastState.clipDistanceMask & 0xf) 422 { 423 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; 424 for (uint32_t c = 0; c < 4; ++c) 425 { 426 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); 427 pBase += sizeof(simdscalar); 428 } 429 } 430 431 if (this->state.rastState.clipDistanceMask & 0xf0) 432 { 433 pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; 434 for (uint32_t c = 0; c < 4; ++c) 435 { 436 transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); 437 pBase += sizeof(simdscalar); 438 } 439 } 440 441 PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); 442 443 while (clipPa.GetNextStreamOutput()) 444 { 445 do 446 { 447 simdvector attrib[NumVertsPerPrim]; 448 bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib); 449 if (assemble) 450 { 451 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; 452 pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim])); 453 } 454 } while (clipPa.NextPrim()); 455 } 456 } 457 458 // update global pipeline stat 459 UPDATE_STAT_FE(CPrimitives, numClippedPrims); 460 } 461 462 // execute the clipper stage 463 void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) 464 { 465 SWR_ASSERT(pa.pDC != nullptr); 466 SWR_CONTEXT* pContext = pa.pDC->pContext; 467 468 // set up binner based on PA state 469 PFN_PROCESS_PRIMS pfnBinner; 470 switch (pa.binTopology) 471 { 472 case TOP_POINT_LIST: 473 pfnBinner = BinPoints; 474 break; 475 case TOP_LINE_LIST: 476 case TOP_LINE_STRIP: 477 case TOP_LINE_LOOP: 478 case TOP_LINE_LIST_ADJ: 479 case TOP_LISTSTRIP_ADJ: 480 pfnBinner = BinLines; 481 break; 482 default: 483 pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0)); 484 break; 485 }; 486 487 // update clipper invocations pipeline stat 488 uint32_t numInvoc = _mm_popcnt_u32(primMask); 489 UPDATE_STAT_FE(CInvocations, numInvoc); 490 491 ComputeClipCodes(prim, viewportIdx); 492 493 // cull prims with NAN coords 494 primMask &= ~ComputeNaNMask(prim); 495 496 // user cull distance cull 497 if (this->state.rastState.cullDistanceMask) 498 { 499 primMask &= ~ComputeUserClipCullMask(pa, prim); 500 } 501 502 // cull prims outside view frustum 503 simdscalar clipIntersection = ComputeClipCodeIntersection(); 504 int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); 505 506 // skip clipping for points 507 uint32_t clipMask = 0; 508 if (NumVertsPerPrim != 1) 509 { 510 clipMask = primMask & ComputeClipMask(); 511 } 512 513 if (clipMask) 514 { 515 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); 516 // we have to clip tris, execute the clipper, which will also 517 // call the binner 518 ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx); 519 AR_END(FEGuardbandClip, 1); 520 } 521 else if (validMask) 522 { 523 // update CPrimitives pipeline state 524 UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); 525 526 // forward valid prims directly to binner 527 pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx); 528 } 529 } 530 531 private: 532 inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) 533 { 534 return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); 535 } 536 537 inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) 538 { 539 const uint32_t simdVertexStride = sizeof(simdvertex); 540 const uint32_t componentStride = sizeof(simdscalar); 541 const uint32_t attribStride = sizeof(simdvector); 542 const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), 543 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); 544 545 // step to the simdvertex 546 simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); 547 548 // step to the attribute and component 549 vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component)); 550 551 // step to the lane 552 vOffsets = _simd_add_epi32(vOffsets, vElemOffset); 553 554 return vOffsets; 555 } 556 557 // gathers a single component for a given attribute for each SIMD lane 558 inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component) 559 { 560 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); 561 simdscalar vSrc = _mm256_undefined_ps(); 562 return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1); 563 } 564 565 inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) 566 { 567 simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); 568 569 uint32_t* pOffsets = (uint32_t*)&vOffsets; 570 float* pSrc = (float*)&vSrc; 571 uint32_t mask = _simd_movemask_ps(vMask); 572 DWORD lane; 573 while (_BitScanForward(&lane, mask)) 574 { 575 mask &= ~(1 << lane); 576 uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane]; 577 *(float*)pBuf = pSrc[lane]; 578 } 579 } 580 581 template<SWR_CLIPCODES ClippingPlane> 582 inline void intersect( 583 const simdscalar& vActiveMask, // active lanes to operate on 584 const simdscalari& s, // index to first edge vertex v0 in pInPts. 585 const simdscalari& p, // index to second edge vertex v1 in pInPts. 586 const simdvector& v1, // vertex 0 position 587 const simdvector& v2, // vertex 1 position 588 simdscalari& outIndex, // output index. 589 const float *pInVerts, // array of all the input positions. 590 uint32_t numInAttribs, // number of attributes per vertex. 591 float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. 592 { 593 // compute interpolation factor 594 simdscalar t; 595 switch (ClippingPlane) 596 { 597 case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break; 598 case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break; 599 case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break; 600 case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break; 601 case FRUSTUM_NEAR: 602 // DX Znear plane is 0, GL is -w 603 if (this->state.rastState.clipHalfZ) 604 { 605 t = ComputeInterpFactor(v1[2], v2[2]); 606 } 607 else 608 { 609 t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2])); 610 } 611 break; 612 case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break; 613 default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); 614 }; 615 616 // interpolate position and store 617 for (uint32_t c = 0; c < 4; ++c) 618 { 619 simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]); 620 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); 621 } 622 623 // interpolate attributes and store 624 for (uint32_t a = 0; a < numInAttribs; ++a) 625 { 626 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; 627 for (uint32_t c = 0; c < 4; ++c) 628 { 629 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); 630 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); 631 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); 632 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); 633 } 634 } 635 636 // interpolate clip distance if enabled 637 if (this->state.rastState.clipDistanceMask & 0xf) 638 { 639 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; 640 for (uint32_t c = 0; c < 4; ++c) 641 { 642 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); 643 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); 644 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); 645 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); 646 } 647 } 648 649 if (this->state.rastState.clipDistanceMask & 0xf0) 650 { 651 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; 652 for (uint32_t c = 0; c < 4; ++c) 653 { 654 simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); 655 simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); 656 simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); 657 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); 658 } 659 } 660 } 661 662 template<SWR_CLIPCODES ClippingPlane> 663 inline simdscalar inside(const simdvector& v) 664 { 665 switch (ClippingPlane) 666 { 667 case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); 668 case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]); 669 case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); 670 case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]); 671 case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); 672 case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]); 673 default: 674 SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); 675 return _simd_setzero_ps(); 676 } 677 } 678 679 template<SWR_CLIPCODES ClippingPlane> 680 simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) 681 { 682 simdscalari vCurIndex = _simd_setzero_si(); 683 simdscalari vOutIndex = _simd_setzero_si(); 684 simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); 685 686 while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty 687 { 688 simdscalari s = vCurIndex; 689 simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); 690 simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p); 691 p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask))); 692 693 // gather position 694 simdvector vInPos0, vInPos1; 695 for (uint32_t c = 0; c < 4; ++c) 696 { 697 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); 698 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); 699 } 700 701 // compute inside mask 702 simdscalar s_in = inside<ClippingPlane>(vInPos0); 703 simdscalar p_in = inside<ClippingPlane>(vInPos1); 704 705 // compute intersection mask (s_in != p_in) 706 simdscalar intersectMask = _simd_xor_ps(s_in, p_in); 707 intersectMask = _simd_and_ps(intersectMask, vActiveMask); 708 709 // store s if inside 710 s_in = _simd_and_ps(s_in, vActiveMask); 711 if (!_simd_testz_ps(s_in, s_in)) 712 { 713 // store position 714 for (uint32_t c = 0; c < 4; ++c) 715 { 716 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); 717 } 718 719 // store attribs 720 for (uint32_t a = 0; a < numInAttribs; ++a) 721 { 722 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; 723 for (uint32_t c = 0; c < 4; ++c) 724 { 725 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); 726 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); 727 } 728 } 729 730 // store clip distance if enabled 731 if (this->state.rastState.clipDistanceMask & 0xf) 732 { 733 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; 734 for (uint32_t c = 0; c < 4; ++c) 735 { 736 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); 737 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); 738 } 739 } 740 741 if (this->state.rastState.clipDistanceMask & 0xf0) 742 { 743 uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; 744 for (uint32_t c = 0; c < 4; ++c) 745 { 746 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); 747 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); 748 } 749 } 750 751 // increment outIndex 752 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); 753 } 754 755 // compute and store intersection 756 if (!_simd_testz_ps(intersectMask, intersectMask)) 757 { 758 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); 759 760 // increment outIndex for active lanes 761 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); 762 } 763 764 // increment loop index and update active mask 765 vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1)); 766 vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); 767 } 768 769 return vOutIndex; 770 } 771 772 template<SWR_CLIPCODES ClippingPlane> 773 simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) 774 { 775 simdscalari vCurIndex = _simd_setzero_si(); 776 simdscalari vOutIndex = _simd_setzero_si(); 777 simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); 778 779 if (!_simd_testz_ps(vActiveMask, vActiveMask)) 780 { 781 simdscalari s = vCurIndex; 782 simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); 783 784 // gather position 785 simdvector vInPos0, vInPos1; 786 for (uint32_t c = 0; c < 4; ++c) 787 { 788 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); 789 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); 790 } 791 792 // compute inside mask 793 simdscalar s_in = inside<ClippingPlane>(vInPos0); 794 simdscalar p_in = inside<ClippingPlane>(vInPos1); 795 796 // compute intersection mask (s_in != p_in) 797 simdscalar intersectMask = _simd_xor_ps(s_in, p_in); 798 intersectMask = _simd_and_ps(intersectMask, vActiveMask); 799 800 // store s if inside 801 s_in = _simd_and_ps(s_in, vActiveMask); 802 if (!_simd_testz_ps(s_in, s_in)) 803 { 804 for (uint32_t c = 0; c < 4; ++c) 805 { 806 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); 807 } 808 809 // interpolate attributes and store 810 for (uint32_t a = 0; a < numInAttribs; ++a) 811 { 812 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; 813 for (uint32_t c = 0; c < 4; ++c) 814 { 815 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); 816 ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); 817 } 818 } 819 820 // increment outIndex 821 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); 822 } 823 824 // compute and store intersection 825 if (!_simd_testz_ps(intersectMask, intersectMask)) 826 { 827 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); 828 829 // increment outIndex for active lanes 830 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); 831 } 832 833 // store p if inside 834 p_in = _simd_and_ps(p_in, vActiveMask); 835 if (!_simd_testz_ps(p_in, p_in)) 836 { 837 for (uint32_t c = 0; c < 4; ++c) 838 { 839 ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); 840 } 841 842 // interpolate attributes and store 843 for (uint32_t a = 0; a < numInAttribs; ++a) 844 { 845 uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; 846 for (uint32_t c = 0; c < 4; ++c) 847 { 848 simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); 849 ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); 850 } 851 } 852 853 // increment outIndex 854 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in); 855 } 856 } 857 858 return vOutIndex; 859 } 860 861 ////////////////////////////////////////////////////////////////////////// 862 /// @brief Vertical clipper. Clips SIMD primitives at a time 863 /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer 864 /// @param vPrimMask - mask of valid input primitives, including non-clipped prims 865 /// @param numAttribs - number of valid input attribs, including position 866 simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) 867 { 868 // temp storage 869 float* pTempVerts = (float*)&tlsTempVertices[0]; 870 871 // zero out num input verts for non-active lanes 872 simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); 873 vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask); 874 875 // clip prims to frustum 876 simdscalari vNumOutPts; 877 if (NumVertsPerPrim == 3) 878 { 879 vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); 880 vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); 881 vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); 882 vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); 883 vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); 884 vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); 885 } 886 else 887 { 888 SWR_ASSERT(NumVertsPerPrim == 2); 889 vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); 890 vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); 891 vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); 892 vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); 893 vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); 894 vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); 895 } 896 897 // restore num verts for non-clipped, active lanes 898 simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask); 899 vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask); 900 901 return vNumOutPts; 902 } 903 904 const uint32_t workerId{ 0 }; 905 DRAW_CONTEXT* pDC{ nullptr }; 906 const API_STATE& state; 907 simdscalar clipCodes[NumVertsPerPrim]; 908 }; 909 910 911 // pipeline stage functions 912 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); 913 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); 914 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); 915