Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file frontend.h
     24 *
     25 * @brief Definitions for Frontend which handles vertex processing,
     26 *        primitive assembly, clipping, binning, etc.
     27 *
     28 ******************************************************************************/
     29 #pragma once
     30 #include "context.h"
     31 #include "common/simdintrin.h"
     32 #include <type_traits>
     33 
     34 // Calculates the A and B coefficients for the 3 edges of the triangle
     35 //
     36 // maths for edge equations:
     37 //   standard form of a line in 2d
     38 //   Ax + By + C = 0
     39 //   A = y0 - y1
     40 //   B = x1 - x0
     41 //   C = x0y1 - x1y0
     42 INLINE
     43 void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
     44 {
     45     // vYsub = y1 y2 y0 dc
     46     __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
     47     // vY =    y0 y1 y2 dc
     48     vA = _mm_sub_ps(vY, vYsub);
     49 
     50     // Result:
     51     // A[0] = y0 - y1
     52     // A[1] = y1 - y2
     53     // A[2] = y2 - y0
     54 
     55     // vXsub = x1 x2 x0 dc
     56     __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
     57     // vX =    x0 x1 x2 dc
     58     vB = _mm_sub_ps(vXsub, vX);
     59 
     60     // Result:
     61     // B[0] = x1 - x0
     62     // B[1] = x2 - x1
     63     // B[2] = x0 - x2
     64 }
     65 
     66 INLINE
     67 void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
     68 {
     69     // generate edge equations
     70     // A = y0 - y1
     71     // B = x1 - x0
     72     // C = x0y1 - x1y0
     73     __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
     74     vA = _mm_sub_epi32(vY, vYsub);
     75 
     76     __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
     77     vB = _mm_sub_epi32(vXsub, vX);
     78 }
     79 
     80 INLINE
     81 void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
     82 {
     83     // A = y0 - y1
     84     // B = x1 - x0
     85     vA[0] = _simd_sub_epi32(vY[0], vY[1]);
     86     vA[1] = _simd_sub_epi32(vY[1], vY[2]);
     87     vA[2] = _simd_sub_epi32(vY[2], vY[0]);
     88 
     89     vB[0] = _simd_sub_epi32(vX[1], vX[0]);
     90     vB[1] = _simd_sub_epi32(vX[2], vX[1]);
     91     vB[2] = _simd_sub_epi32(vX[0], vX[2]);
     92 }
     93 
     94 #if ENABLE_AVX512_SIMD16
     95 INLINE
     96 void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3])
     97 {
     98     // A = y0 - y1
     99     // B = x1 - x0
    100     vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
    101     vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
    102     vA[2] = _simd16_sub_epi32(vY[2], vY[0]);
    103 
    104     vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
    105     vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
    106     vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
    107 }
    108 
    109 #endif
    110 // Calculate the determinant of the triangle
    111 // 2 vectors between the 3 points: P, Q
    112 // Px = x0-x2, Py = y0-y2
    113 // Qx = x1-x2, Qy = y1-y2
    114 //       |Px Qx|
    115 // det = |     | = PxQy - PyQx
    116 //       |Py Qy|
    117 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
    118 //               try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
    119 //               : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
    120 //               : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
    121 //               : B[2]*A[1] - A[2]*B[1]
    122 INLINE
    123 float calcDeterminantInt(const __m128i vA, const __m128i vB)
    124 {
    125     // vAShuf = [A1, A0, A2, A0]
    126     __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
    127     // vBShuf = [B2, B0, B1, B0]
    128     __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
    129     // vMul = [A1*B2, B1*A2]
    130     __m128i vMul   = _mm_mul_epi32(vAShuf, vBShuf);
    131 
    132     // shuffle upper to lower
    133     // vMul2 = [B1*A2, B1*A2]
    134     __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
    135     //vMul = [A1*B2 - B1*A2]
    136     vMul = _mm_sub_epi64(vMul, vMul2);
    137 
    138     int64_t result;
    139     _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
    140 
    141     double dResult = (double)result;
    142     dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
    143 
    144     return (float)dResult;
    145 }
    146 
    147 INLINE
    148 void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
    149 {
    150     // refer to calcDeterminantInt comment for calculation explanation
    151 
    152     // A1*B2
    153     simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);     // 0 0 1 1 4 4 5 5
    154     simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);     // 2 2 3 3 6 6 7 7
    155 
    156     simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
    157     simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
    158 
    159     simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo);        // 0 1 4 5
    160     simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi);        // 2 3 6 7
    161 
    162     // B1*A2
    163     simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
    164     simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
    165 
    166     simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
    167     simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
    168 
    169     simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
    170     simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
    171 
    172     // A1*B2 - A2*B1
    173     simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
    174     simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
    175 
    176     // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
    177     simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
    178 
    179     // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
    180     simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);
    181 
    182     pvDet[0] = vResultLo;
    183     pvDet[1] = vResultHi;
    184 }
    185 
    186 #if ENABLE_AVX512_SIMD16
    187 INLINE
    188 void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
    189 {
    190     // refer to calcDeterminantInt comment for calculation explanation
    191 
    192     // A1*B2
    193     simd16scalari vA1_lo = _simd16_unpacklo_epi32(vA[1], vA[1]);                // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
    194     simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]);                // X 2 X 3 X 6 X 7 X A X B X E X F
    195 
    196     simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    197     simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
    198 
    199     simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo);                 // 0 1 4 5 8 9 C D (64b)
    200     simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi);                 // 2 3 6 7 A B E F
    201 
    202     // B1*A2
    203     simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    204     simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
    205 
    206     simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    207     simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);
    208 
    209     simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
    210     simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);
    211 
    212     // A1*B2 - A2*B1
    213     simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo);               // 0 1 4 5 8 9 C D (64b)
    214     simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi);               // 2 3 6 7 A B E F
    215 
    216     // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
    217     simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44);       // 0 1 4 5 2 3 6 7 (64b)
    218     simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE);       // 8 9 C D A B E F
    219 
    220     // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
    221     pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8);                   // 0 1 2 3 4 5 6 7 (64b)
    222     pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8);                   // 8 9 A B C D E F
    223 }
    224 
    225 #endif
    226 INLINE
    227 void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
    228 {
    229     // C = -Ax - By
    230     vC  = _mm_mul_ps(vA, vX);
    231     __m128 vCy = _mm_mul_ps(vB, vY);
    232     vC  = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
    233     vC  = _mm_sub_ps(vC, vCy);
    234 }
    235 
    236 template<uint32_t NumVerts>
    237 INLINE
    238 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
    239 {
    240     simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
    241     simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
    242     simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
    243     simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
    244     simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
    245     simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);
    246 
    247     for (uint32_t i = 0; i < NumVerts; ++i)
    248     {
    249         v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
    250         v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
    251         v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
    252     }
    253 }
    254 
    255 #if USE_SIMD16_FRONTEND
    256 template<uint32_t NumVerts>
    257 INLINE
    258 void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
    259 {
    260     const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    261     const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    262     const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    263     const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    264     const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    265     const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);
    266 
    267     for (uint32_t i = 0; i < NumVerts; ++i)
    268     {
    269         v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
    270         v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
    271         v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    272     }
    273 }
    274 
    275 #endif
    276 template<uint32_t NumVerts>
    277 INLINE
    278 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari const &vViewportIdx)
    279 {
    280     // perform a gather of each matrix element based on the viewport array indexes
    281     simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    282     simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    283     simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    284     simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    285     simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    286     simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
    287 
    288     for (uint32_t i = 0; i < NumVerts; ++i)
    289     {
    290         v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
    291         v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
    292         v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
    293     }
    294 }
    295 
    296 #if USE_SIMD16_FRONTEND
    297 template<uint32_t NumVerts>
    298 INLINE
    299 void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari const &vViewportIdx)
    300 {
    301     // perform a gather of each matrix element based on the viewport array indexes
    302     const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    303     const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    304     const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    305     const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    306     const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    307     const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
    308 
    309     for (uint32_t i = 0; i < NumVerts; ++i)
    310     {
    311         v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
    312         v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
    313         v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    314     }
    315 }
    316 
    317 #endif
    318 INLINE
    319 void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox)
    320 {
    321     // Need horizontal fp min here
    322     __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
    323     __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
    324 
    325     __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
    326     __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
    327 
    328 
    329     __m128i vMinX = _mm_min_epi32(vX, vX1);
    330             vMinX = _mm_min_epi32(vMinX, vX2);
    331 
    332     __m128i vMaxX = _mm_max_epi32(vX, vX1);
    333             vMaxX = _mm_max_epi32(vMaxX, vX2);
    334 
    335     __m128i vMinY = _mm_min_epi32(vY, vY1);
    336             vMinY = _mm_min_epi32(vMinY, vY2);
    337 
    338     __m128i vMaxY = _mm_max_epi32(vY, vY1);
    339             vMaxY = _mm_max_epi32(vMaxY, vY2);
    340 
    341     bbox.xmin = _mm_extract_epi32(vMinX, 0);
    342     bbox.xmax = _mm_extract_epi32(vMaxX, 0);
    343     bbox.ymin = _mm_extract_epi32(vMinY, 0);
    344     bbox.ymax = _mm_extract_epi32(vMaxY, 0);
    345 }
    346 
    347 INLINE
    348 bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
    349 {
    350     const API_STATE& state = GetApiState(pDC);
    351 
    352     return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
    353             state.rastState.pointSize == 1.0f &&
    354             !state.rastState.pointParam &&
    355             !state.rastState.pointSpriteEnable &&
    356             !state.backendState.clipDistanceMask);
    357 }
    358 
    359 INLINE
    360 bool vHasNaN(const __m128& vec)
    361 {
    362     const __m128 result = _mm_cmpunord_ps(vec, vec);
    363     const int32_t mask = _mm_movemask_ps(result);
    364     return (mask != 0);
    365 }
    366 
    367 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
    368 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
    369 
    370 
    371 // ProcessDraw front-end function.  All combinations of parameter values are available
    372 PFN_FE_WORK_FUNC GetProcessDrawFunc(
    373     bool IsIndexed,
    374     bool IsCutIndexEnabled,
    375     bool HasTessellation,
    376     bool HasGeometryShader,
    377     bool HasStreamOut,
    378     bool HasRasterization);
    379 
    380 void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
    381 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
    382 void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
    383 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
    384 void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
    385 
    386 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
    387 #if USE_SIMD16_FRONTEND
    388 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
    389 #endif
    390 
    391 struct PA_STATE_BASE;  // forward decl
    392 void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
    393 void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
    394 #if USE_SIMD16_FRONTEND
    395 void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
    396 void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
    397 #endif
    398 
    399