1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file frontend.h 24 * 25 * @brief Definitions for Frontend which handles vertex processing, 26 * primitive assembly, clipping, binning, etc. 27 * 28 ******************************************************************************/ 29 #pragma once 30 #include "context.h" 31 #include "common/simdintrin.h" 32 #include <type_traits> 33 34 // Calculates the A and B coefficients for the 3 edges of the triangle 35 // 36 // maths for edge equations: 37 // standard form of a line in 2d 38 // Ax + By + C = 0 39 // A = y0 - y1 40 // B = x1 - x0 41 // C = x0y1 - x1y0 42 INLINE 43 void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) 44 { 45 // vYsub = y1 y2 y0 dc 46 __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); 47 // vY = y0 y1 y2 dc 48 vA = _mm_sub_ps(vY, vYsub); 49 50 // Result: 51 // A[0] = y0 - y1 52 // A[1] = y1 - y2 53 // A[2] = y2 - y0 54 55 // vXsub = x1 x2 x0 dc 56 __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); 57 // vX = x0 x1 x2 dc 58 vB = _mm_sub_ps(vXsub, vX); 59 60 // Result: 61 // B[0] = x1 - x0 62 // B[1] = x2 - x1 63 // B[2] = x0 - x2 64 } 65 66 INLINE 67 void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) 68 { 69 // generate edge equations 70 // A = y0 - y1 71 // B = x1 - x0 72 // C = x0y1 - x1y0 73 __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); 74 vA = _mm_sub_epi32(vY, vYsub); 75 76 __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); 77 vB = _mm_sub_epi32(vXsub, vX); 78 } 79 80 INLINE 81 void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) 82 { 83 // A = y0 - y1 84 // B = x1 - x0 85 vA[0] = _simd_sub_epi32(vY[0], vY[1]); 86 vA[1] = _simd_sub_epi32(vY[1], vY[2]); 87 vA[2] = _simd_sub_epi32(vY[2], vY[0]); 88 89 vB[0] = _simd_sub_epi32(vX[1], vX[0]); 90 vB[1] = _simd_sub_epi32(vX[2], vX[1]); 91 vB[2] = _simd_sub_epi32(vX[0], vX[2]); 92 } 93 94 #if ENABLE_AVX512_SIMD16 95 INLINE 96 void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3]) 97 { 98 // A = y0 - y1 99 // B = x1 - x0 100 vA[0] = _simd16_sub_epi32(vY[0], vY[1]); 101 vA[1] = _simd16_sub_epi32(vY[1], vY[2]); 102 vA[2] = _simd16_sub_epi32(vY[2], vY[0]); 103 104 vB[0] = _simd16_sub_epi32(vX[1], vX[0]); 105 vB[1] = _simd16_sub_epi32(vX[2], vX[1]); 106 vB[2] = _simd16_sub_epi32(vX[0], vX[2]); 107 } 108 109 #endif 110 // Calculate the determinant of the triangle 111 // 2 vectors between the 3 points: P, Q 112 // Px = x0-x2, Py = y0-y2 113 // Qx = x1-x2, Qy = y1-y2 114 // |Px Qx| 115 // det = | | = PxQy - PyQx 116 // |Py Qy| 117 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) 118 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx 119 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1)) 120 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1) 121 // : B[2]*A[1] - A[2]*B[1] 122 INLINE 123 float calcDeterminantInt(const __m128i vA, const __m128i vB) 124 { 125 // vAShuf = [A1, A0, A2, A0] 126 __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1)); 127 // vBShuf = [B2, B0, B1, B0] 128 __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2)); 129 // vMul = [A1*B2, B1*A2] 130 __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); 131 132 // shuffle upper to lower 133 // vMul2 = [B1*A2, B1*A2] 134 __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2)); 135 //vMul = [A1*B2 - B1*A2] 136 vMul = _mm_sub_epi64(vMul, vMul2); 137 138 int64_t result; 139 _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); 140 141 double dResult = (double)result; 142 dResult = dResult * (1.0 / FIXED_POINT16_SCALE); 143 144 return (float)dResult; 145 } 146 147 INLINE 148 void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet) 149 { 150 // refer to calcDeterminantInt comment for calculation explanation 151 152 // A1*B2 153 simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 154 simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 155 156 simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]); 157 simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]); 158 159 simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 160 simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 161 162 // B1*A2 163 simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]); 164 simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]); 165 166 simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]); 167 simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]); 168 169 simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo); 170 simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi); 171 172 // A1*B2 - A2*B1 173 simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo); 174 simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi); 175 176 // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3 177 simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20); 178 179 // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7 180 simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31); 181 182 pvDet[0] = vResultLo; 183 pvDet[1] = vResultHi; 184 } 185 186 #if ENABLE_AVX512_SIMD16 187 INLINE 188 void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet) 189 { 190 // refer to calcDeterminantInt comment for calculation explanation 191 192 // A1*B2 193 simd16scalari vA1_lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b) 194 simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F 195 196 simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]); 197 simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]); 198 199 simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b) 200 simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F 201 202 // B1*A2 203 simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]); 204 simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]); 205 206 simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]); 207 simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]); 208 209 simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo); 210 simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi); 211 212 // A1*B2 - A2*B1 213 simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b) 214 simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F 215 216 // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE 217 simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b) 218 simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F 219 220 // (3, 1, 2, 0) = 11 01 10 00 = 0xD8 221 pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b) 222 pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F 223 } 224 225 #endif 226 INLINE 227 void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC) 228 { 229 // C = -Ax - By 230 vC = _mm_mul_ps(vA, vX); 231 __m128 vCy = _mm_mul_ps(vB, vY); 232 vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); 233 vC = _mm_sub_ps(vC, vCy); 234 } 235 236 template<uint32_t NumVerts> 237 INLINE 238 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) 239 { 240 simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]); 241 simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]); 242 simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]); 243 simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]); 244 simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]); 245 simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]); 246 247 for (uint32_t i = 0; i < NumVerts; ++i) 248 { 249 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); 250 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); 251 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); 252 } 253 } 254 255 #if USE_SIMD16_FRONTEND 256 template<uint32_t NumVerts> 257 INLINE 258 void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) 259 { 260 const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]); 261 const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]); 262 const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]); 263 const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]); 264 const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]); 265 const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]); 266 267 for (uint32_t i = 0; i < NumVerts; ++i) 268 { 269 v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30); 270 v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31); 271 v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32); 272 } 273 } 274 275 #endif 276 template<uint32_t NumVerts> 277 INLINE 278 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari const &vViewportIdx) 279 { 280 // perform a gather of each matrix element based on the viewport array indexes 281 simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); 282 simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4); 283 simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4); 284 simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4); 285 simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4); 286 simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4); 287 288 for (uint32_t i = 0; i < NumVerts; ++i) 289 { 290 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); 291 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); 292 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); 293 } 294 } 295 296 #if USE_SIMD16_FRONTEND 297 template<uint32_t NumVerts> 298 INLINE 299 void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari const &vViewportIdx) 300 { 301 // perform a gather of each matrix element based on the viewport array indexes 302 const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); 303 const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4); 304 const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4); 305 const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4); 306 const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4); 307 const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4); 308 309 for (uint32_t i = 0; i < NumVerts; ++i) 310 { 311 v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30); 312 v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31); 313 v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32); 314 } 315 } 316 317 #endif 318 INLINE 319 void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox) 320 { 321 // Need horizontal fp min here 322 __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); 323 __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); 324 325 __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); 326 __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); 327 328 329 __m128i vMinX = _mm_min_epi32(vX, vX1); 330 vMinX = _mm_min_epi32(vMinX, vX2); 331 332 __m128i vMaxX = _mm_max_epi32(vX, vX1); 333 vMaxX = _mm_max_epi32(vMaxX, vX2); 334 335 __m128i vMinY = _mm_min_epi32(vY, vY1); 336 vMinY = _mm_min_epi32(vMinY, vY2); 337 338 __m128i vMaxY = _mm_max_epi32(vY, vY1); 339 vMaxY = _mm_max_epi32(vMaxY, vY2); 340 341 bbox.xmin = _mm_extract_epi32(vMinX, 0); 342 bbox.xmax = _mm_extract_epi32(vMaxX, 0); 343 bbox.ymin = _mm_extract_epi32(vMinY, 0); 344 bbox.ymax = _mm_extract_epi32(vMaxY, 0); 345 } 346 347 INLINE 348 bool CanUseSimplePoints(DRAW_CONTEXT *pDC) 349 { 350 const API_STATE& state = GetApiState(pDC); 351 352 return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && 353 state.rastState.pointSize == 1.0f && 354 !state.rastState.pointParam && 355 !state.rastState.pointSpriteEnable && 356 !state.backendState.clipDistanceMask); 357 } 358 359 INLINE 360 bool vHasNaN(const __m128& vec) 361 { 362 const __m128 result = _mm_cmpunord_ps(vec, vec); 363 const int32_t mask = _mm_movemask_ps(result); 364 return (mask != 0); 365 } 366 367 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); 368 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); 369 370 371 // ProcessDraw front-end function. All combinations of parameter values are available 372 PFN_FE_WORK_FUNC GetProcessDrawFunc( 373 bool IsIndexed, 374 bool IsCutIndexEnabled, 375 bool HasTessellation, 376 bool HasGeometryShader, 377 bool HasStreamOut, 378 bool HasRasterization); 379 380 void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 381 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 382 void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 383 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 384 void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 385 386 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative); 387 #if USE_SIMD16_FRONTEND 388 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative); 389 #endif 390 391 struct PA_STATE_BASE; // forward decl 392 void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx); 393 void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx); 394 #if USE_SIMD16_FRONTEND 395 void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); 396 void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx); 397 #endif 398 399