1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file frontend.h 24 * 25 * @brief Definitions for Frontend which handles vertex processing, 26 * primitive assembly, clipping, binning, etc. 27 * 28 ******************************************************************************/ 29 #pragma once 30 #include "context.h" 31 #include <type_traits> 32 33 // Calculates the A and B coefficients for the 3 edges of the triangle 34 // 35 // maths for edge equations: 36 // standard form of a line in 2d 37 // Ax + By + C = 0 38 // A = y0 - y1 39 // B = x1 - x0 40 // C = x0y1 - x1y0 41 INLINE 42 void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) 43 { 44 // vYsub = y1 y2 y0 dc 45 __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); 46 // vY = y0 y1 y2 dc 47 vA = _mm_sub_ps(vY, vYsub); 48 49 // Result: 50 // A[0] = y0 - y1 51 // A[1] = y1 - y2 52 // A[2] = y2 - y0 53 54 // vXsub = x1 x2 x0 dc 55 __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); 56 // vX = x0 x1 x2 dc 57 vB = _mm_sub_ps(vXsub, vX); 58 59 // Result: 60 // B[0] = x1 - x0 61 // B[1] = x2 - x1 62 // B[2] = x0 - x2 63 } 64 65 INLINE 66 void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3]) 67 { 68 // generate edge equations 69 // A = y0 - y1 70 // B = x1 - x0 71 vA[0] = _simd_sub_ps(vY[0], vY[1]); 72 vA[1] = _simd_sub_ps(vY[1], vY[2]); 73 vA[2] = _simd_sub_ps(vY[2], vY[0]); 74 75 vB[0] = _simd_sub_ps(vX[1], vX[0]); 76 vB[1] = _simd_sub_ps(vX[2], vX[1]); 77 vB[2] = _simd_sub_ps(vX[0], vX[2]); 78 } 79 80 INLINE 81 void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) 82 { 83 // generate edge equations 84 // A = y0 - y1 85 // B = x1 - x0 86 // C = x0y1 - x1y0 87 __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); 88 vA = _mm_sub_epi32(vY, vYsub); 89 90 __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); 91 vB = _mm_sub_epi32(vXsub, vX); 92 } 93 94 INLINE 95 void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) 96 { 97 // A = y0 - y1 98 // B = x1 - x0 99 vA[0] = _simd_sub_epi32(vY[0], vY[1]); 100 vA[1] = _simd_sub_epi32(vY[1], vY[2]); 101 vA[2] = _simd_sub_epi32(vY[2], vY[0]); 102 103 vB[0] = _simd_sub_epi32(vX[1], vX[0]); 104 vB[1] = _simd_sub_epi32(vX[2], vX[1]); 105 vB[2] = _simd_sub_epi32(vX[0], vX[2]); 106 } 107 // Calculate the determinant of the triangle 108 // 2 vectors between the 3 points: P, Q 109 // Px = x0-x2, Py = y0-y2 110 // Qx = x1-x2, Qy = y1-y2 111 // |Px Qx| 112 // det = | | = PxQy - PyQx 113 // |Py Qy| 114 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) 115 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx 116 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1)) 117 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1) 118 // : B[2]*A[1] - A[2]*B[1] 119 INLINE 120 float calcDeterminantInt(const __m128i vA, const __m128i vB) 121 { 122 // vAShuf = [A1, A0, A2, A0] 123 __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1)); 124 // vBShuf = [B2, B0, B1, B0] 125 __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2)); 126 // vMul = [A1*B2, B1*A2] 127 __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); 128 129 // shuffle upper to lower 130 // vMul2 = [B1*A2, B1*A2] 131 __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2)); 132 //vMul = [A1*B2 - B1*A2] 133 vMul = _mm_sub_epi64(vMul, vMul2); 134 135 int64_t result; 136 _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); 137 138 double dResult = (double)result; 139 dResult = dResult * (1.0 / FIXED_POINT16_SCALE); 140 141 return (float)dResult; 142 } 143 144 INLINE 145 void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet) 146 { 147 // refer to calcDeterminantInt comment for calculation explanation 148 // A1*B2 149 simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 150 simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 151 152 simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]); 153 simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]); 154 155 simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 156 simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 157 158 // B1*A2 159 simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]); 160 simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]); 161 162 simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]); 163 simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]); 164 165 simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo); 166 simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi); 167 168 // A1*B2 - A2*B1 169 simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo); 170 simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi); 171 172 // shuffle 0 1 4 5 -> 0 1 2 3 173 simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20); 174 simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31); 175 176 pvDet[0] = vResultLo; 177 pvDet[1] = vResultHi; 178 } 179 180 INLINE 181 void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC) 182 { 183 // C = -Ax - By 184 vC = _mm_mul_ps(vA, vX); 185 __m128 vCy = _mm_mul_ps(vB, vY); 186 vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); 187 vC = _mm_sub_ps(vC, vCy); 188 } 189 190 INLINE 191 void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix) 192 { 193 vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00)); 194 vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30)); 195 196 vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11)); 197 vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31)); 198 199 vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22)); 200 vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32)); 201 } 202 203 template<uint32_t NumVerts> 204 INLINE 205 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) 206 { 207 simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]); 208 simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]); 209 simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]); 210 simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]); 211 simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]); 212 simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]); 213 214 for (uint32_t i = 0; i < NumVerts; ++i) 215 { 216 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); 217 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); 218 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); 219 } 220 } 221 222 template<uint32_t NumVerts> 223 INLINE 224 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx) 225 { 226 // perform a gather of each matrix element based on the viewport array indexes 227 simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); 228 simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4); 229 simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4); 230 simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4); 231 simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4); 232 simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4); 233 234 for (uint32_t i = 0; i < NumVerts; ++i) 235 { 236 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); 237 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); 238 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); 239 } 240 } 241 242 INLINE 243 void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox) 244 { 245 // Need horizontal fp min here 246 __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); 247 __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); 248 249 __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); 250 __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); 251 252 253 __m128i vMinX = _mm_min_epi32(vX, vX1); 254 vMinX = _mm_min_epi32(vMinX, vX2); 255 256 __m128i vMaxX = _mm_max_epi32(vX, vX1); 257 vMaxX = _mm_max_epi32(vMaxX, vX2); 258 259 __m128i vMinY = _mm_min_epi32(vY, vY1); 260 vMinY = _mm_min_epi32(vMinY, vY2); 261 262 __m128i vMaxY = _mm_max_epi32(vY, vY1); 263 vMaxY = _mm_max_epi32(vMaxY, vY2); 264 265 bbox.xmin = _mm_extract_epi32(vMinX, 0); 266 bbox.xmax = _mm_extract_epi32(vMaxX, 0); 267 bbox.ymin = _mm_extract_epi32(vMinY, 0); 268 bbox.ymax = _mm_extract_epi32(vMaxY, 0); 269 } 270 271 INLINE 272 bool CanUseSimplePoints(DRAW_CONTEXT *pDC) 273 { 274 const API_STATE& state = GetApiState(pDC); 275 276 return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && 277 state.rastState.pointSize == 1.0f && 278 !state.rastState.pointParam && 279 !state.rastState.pointSpriteEnable); 280 } 281 282 INLINE 283 bool vHasNaN(const __m128& vec) 284 { 285 const __m128 result = _mm_cmpunord_ps(vec, vec); 286 const int32_t mask = _mm_movemask_ps(result); 287 return (mask != 0); 288 } 289 290 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); 291 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); 292 293 294 // ProcessDraw front-end function. All combinations of parameter values are available 295 PFN_FE_WORK_FUNC GetProcessDrawFunc( 296 bool IsIndexed, 297 bool IsCutIndexEnabled, 298 bool HasTessellation, 299 bool HasGeometryShader, 300 bool HasStreamOut, 301 bool HasRasterization); 302 303 void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 304 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 305 void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 306 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 307 void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 308 309 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative); 310 311 struct PA_STATE_BASE; // forward decl 312 void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); 313 void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); 314 315