Home | History | Annotate | Download | only in memory
      1 /****************************************************************************
      2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file StoreTile.h
     24 *
     25 * @brief Functionality for Store.
     26 *
     27 ******************************************************************************/
     28 #pragma once
     29 
     30 #include "common/os.h"
     31 #include "common/formats.h"
     32 #include "core/context.h"
     33 #include "core/rdtsc_core.h"
     34 #include "core/format_conversion.h"
     35 
     36 #include "memory/TilingFunctions.h"
     37 #include "memory/Convert.h"
     38 #include "core/multisample.h"
     39 
     40 #include <array>
     41 #include <sstream>
     42 
     43 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
     44 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
     45 
     46 //////////////////////////////////////////////////////////////////////////
     47 /// Store Raster Tile Function Tables.
     48 //////////////////////////////////////////////////////////////////////////
     49 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
     50 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
     51 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
     52 
     53 void InitStoreTilesTable_Linear_1();
     54 void InitStoreTilesTable_Linear_2();
     55 void InitStoreTilesTable_TileX_1();
     56 void InitStoreTilesTable_TileX_2();
     57 void InitStoreTilesTable_TileY_1();
     58 void InitStoreTilesTable_TileY_2();
     59 void InitStoreTilesTable_TileW();
     60 void InitStoreTilesTable();
     61 
     62 //////////////////////////////////////////////////////////////////////////
     63 /// StorePixels
     64 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
     65 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
     66 /// @param ppDsts   - Array of destination pointers.  Each pointer is
     67 ///                   to a single row of at most 16B.
     68 /// @tparam NumDests - Number of destination pointers.  Each pair of
     69 ///                    pointers is for a 16-byte column of two rows.
     70 //////////////////////////////////////////////////////////////////////////
     71 template <size_t PixelSize, size_t NumDests>
     72 struct StorePixels
     73 {
     74     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
     75 };
     76 
     77 //////////////////////////////////////////////////////////////////////////
     78 /// StorePixels (32-bit pixel specialization)
     79 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
     80 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
     81 /// @param ppDsts   - Array of destination pointers.  Each pointer is
     82 ///                   to a single row of at most 16B.
     83 /// @tparam NumDests - Number of destination pointers.  Each pair of
     84 ///                    pointers is for a 16-byte column of two rows.
     85 //////////////////////////////////////////////////////////////////////////
     86 template <>
     87 struct StorePixels<8, 2>
     88 {
     89     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
     90     {
     91         // Each 4-pixel row is 4 bytes.
     92         const uint16_t* pPixSrc = (const uint16_t*)pSrc;
     93 
     94         // Unswizzle from SWR-Z order
     95         uint16_t* pRow = (uint16_t*)ppDsts[0];
     96         pRow[0] = pPixSrc[0];
     97         pRow[1] = pPixSrc[2];
     98 
     99         pRow = (uint16_t*)ppDsts[1];
    100         pRow[0] = pPixSrc[1];
    101         pRow[1] = pPixSrc[3];
    102     }
    103 };
    104 
    105 #if USE_8x2_TILE_BACKEND
    106 template <>
    107 struct StorePixels<8, 4>
    108 {
    109     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    110     {
    111         // 8 x 2 bytes = 16 bytes, 16 pixels
    112         const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
    113 
    114         uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
    115 
    116         // Unswizzle from SWR-Z order
    117         ppDsts16[0][0] = pSrc16[0];     // 0 1
    118         ppDsts16[0][1] = pSrc16[2];     // 4 5
    119 
    120         ppDsts16[1][0] = pSrc16[1];     // 2 3
    121         ppDsts16[1][1] = pSrc16[3];     // 6 7
    122 
    123         ppDsts16[2][0] = pSrc16[4];     // 8 9
    124         ppDsts16[2][1] = pSrc16[6];     // C D
    125 
    126         ppDsts16[3][0] = pSrc16[5];     // A B
    127         ppDsts16[3][1] = pSrc16[7];     // E F
    128     }
    129 };
    130 
    131 #endif
    132 //////////////////////////////////////////////////////////////////////////
    133 /// StorePixels (32-bit pixel specialization)
    134 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    135 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    136 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    137 ///                   to a single row of at most 16B.
    138 /// @tparam NumDests - Number of destination pointers.  Each pair of
    139 ///                    pointers is for a 16-byte column of two rows.
    140 //////////////////////////////////////////////////////////////////////////
    141 template <>
    142 struct StorePixels<16, 2>
    143 {
    144     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    145     {
    146         // Each 4-pixel row is 8 bytes.
    147         const uint32_t* pPixSrc = (const uint32_t*)pSrc;
    148 
    149         // Unswizzle from SWR-Z order
    150         uint32_t* pRow = (uint32_t*)ppDsts[0];
    151         pRow[0] = pPixSrc[0];
    152         pRow[1] = pPixSrc[2];
    153 
    154         pRow = (uint32_t*)ppDsts[1];
    155         pRow[0] = pPixSrc[1];
    156         pRow[1] = pPixSrc[3];
    157     }
    158 };
    159 
    160 #if USE_8x2_TILE_BACKEND
    161 template <>
    162 struct StorePixels<16, 4>
    163 {
    164     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    165     {
    166         // 8 x 4 bytes = 32 bytes, 16 pixels
    167         const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
    168 
    169         uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
    170 
    171         // Unswizzle from SWR-Z order
    172         ppDsts32[0][0] = pSrc32[0];     // 0 1
    173         ppDsts32[0][1] = pSrc32[2];     // 4 5
    174 
    175         ppDsts32[1][0] = pSrc32[1];     // 2 3
    176         ppDsts32[1][1] = pSrc32[3];     // 6 7
    177 
    178         ppDsts32[2][0] = pSrc32[4];     // 8 9
    179         ppDsts32[2][1] = pSrc32[6];     // C D
    180 
    181         ppDsts32[3][0] = pSrc32[5];     // A B
    182         ppDsts32[3][1] = pSrc32[7];     // E F
    183     }
    184 };
    185 
    186 #endif
    187 //////////////////////////////////////////////////////////////////////////
    188 /// StorePixels (32-bit pixel specialization)
    189 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    190 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    191 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    192 ///                   to a single row of at most 16B.
    193 /// @tparam NumDests - Number of destination pointers.  Each pair of
    194 ///                    pointers is for a 16-byte column of two rows.
    195 //////////////////////////////////////////////////////////////////////////
    196 template <>
    197 struct StorePixels<32, 2>
    198 {
    199     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    200     {
    201         // Each 4-pixel row is 16-bytes
    202         __m128i *pZRow01 = (__m128i*)pSrc;
    203         __m128i vQuad00 = _mm_load_si128(pZRow01);
    204         __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
    205 
    206         __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
    207         __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
    208 
    209         _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
    210         _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
    211     }
    212 };
    213 
    214 #if USE_8x2_TILE_BACKEND
    215 template <>
    216 struct StorePixels<32, 4>
    217 {
    218     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    219     {
    220         // 4 x 16 bytes = 64 bytes, 16 pixels
    221         const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
    222 
    223         __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
    224 
    225         // Unswizzle from SWR-Z order
    226         __m128i quad0 = _mm_load_si128(&pSrc128[0]);                        // 0 1 2 3
    227         __m128i quad1 = _mm_load_si128(&pSrc128[1]);                        // 4 5 6 7
    228         __m128i quad2 = _mm_load_si128(&pSrc128[2]);                        // 8 9 A B
    229         __m128i quad3 = _mm_load_si128(&pSrc128[3]);                        // C D E F
    230 
    231         _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1));   // 0 1 4 5
    232         _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1));   // 2 3 6 7
    233         _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3));   // 8 9 C D
    234         _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3));   // A B E F
    235     }
    236 };
    237 
    238 #endif
    239 //////////////////////////////////////////////////////////////////////////
    240 /// StorePixels (32-bit pixel specialization)
    241 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    242 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    243 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    244 ///                   to a single row of at most 16B.
    245 /// @tparam NumDests - Number of destination pointers.  Each pair of
    246 ///                    pointers is for a 16-byte column of two rows.
    247 //////////////////////////////////////////////////////////////////////////
    248 template <>
    249 struct StorePixels<64, 4>
    250 {
    251     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    252     {
    253         // Each 4-pixel row is 32 bytes.
    254         const __m128i* pPixSrc = (const __m128i*)pSrc;
    255 
    256         // order of pointers match SWR-Z layout
    257         __m128i** pvDsts = (__m128i**)&ppDsts[0];
    258         *pvDsts[0] = pPixSrc[0];
    259         *pvDsts[1] = pPixSrc[1];
    260         *pvDsts[2] = pPixSrc[2];
    261         *pvDsts[3] = pPixSrc[3];
    262     }
    263 };
    264 
    265 #if USE_8x2_TILE_BACKEND
    266 template <>
    267 struct StorePixels<64, 8>
    268 {
    269     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    270     {
    271         // 8 x 16 bytes = 128 bytes, 16 pixels
    272         const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
    273 
    274         __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
    275 
    276         // order of pointers match SWR-Z layout
    277         *ppDsts128[0] = pSrc128[0];     // 0 1
    278         *ppDsts128[1] = pSrc128[1];     // 2 3
    279         *ppDsts128[2] = pSrc128[2];     // 4 5
    280         *ppDsts128[3] = pSrc128[3];     // 6 7
    281         *ppDsts128[4] = pSrc128[4];     // 8 9
    282         *ppDsts128[5] = pSrc128[5];     // A B
    283         *ppDsts128[6] = pSrc128[6];     // C D
    284         *ppDsts128[7] = pSrc128[7];     // E F
    285     }
    286 };
    287 
    288 #endif
    289 //////////////////////////////////////////////////////////////////////////
    290 /// StorePixels (32-bit pixel specialization)
    291 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    292 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    293 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    294 ///                   to a single row of at most 16B.
    295 /// @tparam NumDests - Number of destination pointers.  Each pair of
    296 ///                    pointers is for a 16-byte column of two rows.
    297 //////////////////////////////////////////////////////////////////////////
    298 template <>
    299 struct StorePixels<128, 8>
    300 {
    301     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    302     {
    303         // Each 4-pixel row is 64 bytes.
    304         const __m128i* pPixSrc = (const __m128i*)pSrc;
    305 
    306         // Unswizzle from SWR-Z order
    307         __m128i** pvDsts = (__m128i**)&ppDsts[0];
    308         *pvDsts[0] = pPixSrc[0];
    309         *pvDsts[1] = pPixSrc[2];
    310         *pvDsts[2] = pPixSrc[1];
    311         *pvDsts[3] = pPixSrc[3];
    312         *pvDsts[4] = pPixSrc[4];
    313         *pvDsts[5] = pPixSrc[6];
    314         *pvDsts[6] = pPixSrc[5];
    315         *pvDsts[7] = pPixSrc[7];
    316     }
    317 };
    318 
    319 #if USE_8x2_TILE_BACKEND
    320 template <>
    321 struct StorePixels<128, 16>
    322 {
    323     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    324     {
    325         // 16 x 16 bytes = 256 bytes, 16 pixels
    326         const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
    327 
    328         __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
    329 
    330         for (uint32_t i = 0; i < 16; i += 4)
    331         {
    332             *ppDsts128[i + 0] = pSrc128[i + 0];
    333             *ppDsts128[i + 1] = pSrc128[i + 2];
    334             *ppDsts128[i + 2] = pSrc128[i + 1];
    335             *ppDsts128[i + 3] = pSrc128[i + 3];
    336         }
    337     }
    338 };
    339 
    340 #endif
    341 //////////////////////////////////////////////////////////////////////////
    342 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
    343 //////////////////////////////////////////////////////////////////////////
    344 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
    345 struct ConvertPixelsSOAtoAOS
    346 {
    347     //////////////////////////////////////////////////////////////////////////
    348     /// @brief Converts a SIMD from the Hot Tile to the destination format
    349     ///        and converts from SOA to AOS.
    350     /// @param pSrc - Pointer to raster tile.
    351     /// @param pDst - Pointer to destination surface or deswizzling buffer.
    352     template <size_t NumDests>
    353     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    354     {
    355 #if USE_8x2_TILE_BACKEND
    356         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
    357 
    358         OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
    359         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    360 
    361         // Convert from SrcFormat --> DstFormat
    362         simd16vector src;
    363         LoadSOA<SrcFormat>(pSrc, src);
    364         StoreSOA<DstFormat>(src, soaTile);
    365 
    366         // Convert from SOA --> AOS
    367         FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
    368 
    369 #else
    370         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
    371 
    372         OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
    373         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    374 
    375         // Convert from SrcFormat --> DstFormat
    376         simdvector src;
    377         LoadSOA<SrcFormat>(pSrc, src);
    378         StoreSOA<DstFormat>(src, soaTile);
    379 
    380         // Convert from SOA --> AOS
    381         FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
    382 
    383 #endif
    384         // Store data into destination
    385         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
    386     }
    387 };
    388 
    389 //////////////////////////////////////////////////////////////////////////
    390 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
    391 /// Specialization for no format conversion
    392 //////////////////////////////////////////////////////////////////////////
    393 template<SWR_FORMAT Format>
    394 struct ConvertPixelsSOAtoAOS<Format, Format>
    395 {
    396     //////////////////////////////////////////////////////////////////////////
    397     /// @brief Converts a SIMD from the Hot Tile to the destination format
    398     ///        and converts from SOA to AOS.
    399     /// @param pSrc - Pointer to raster tile.
    400     /// @param pDst - Pointer to destination surface or deswizzling buffer.
    401     template <size_t NumDests>
    402     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    403     {
    404 #if USE_8x2_TILE_BACKEND
    405         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
    406 
    407         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    408 
    409         // Convert from SOA --> AOS
    410         FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
    411 
    412 #else
    413         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
    414 
    415         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    416 
    417         // Convert from SOA --> AOS
    418         FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
    419 
    420 #endif
    421         // Store data into destination
    422         StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
    423     }
    424 };
    425 
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R5_UNORM
/// (source is R32G32B32A32_FLOAT).  The three destination components are
/// packed by hand into one 16-bit value per pixel instead of going through
/// StoreSOA + Transpose.
//////////////////////////////////////////////////////////////////////////
template<>
struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Converts a SIMD from the Hot Tile to the destination format
    ///        and converts from SOA to AOS.
    /// @param pSrc   - Pointer to raster tile.
    /// @param ppDsts - Destination pointers (surface or deswizzling buffer),
    ///                 passed through to StorePixels unchanged.
    template <size_t NumDests>
    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    {
#if USE_8x2_TILE_BACKEND
        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;

        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel

        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];

        // Load hot-tile
        simd16vector src, dst;
        LoadSOA<SrcFormat>(pSrc, src);

        // deswizzle: pick the source component that feeds each of the three
        // destination components (dst.w is never written or read here)
        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];

        // clamp
        dst.x = Clamp<DstFormat>(dst.x, 0);
        dst.y = Clamp<DstFormat>(dst.y, 1);
        dst.z = Clamp<DstFormat>(dst.z, 2);

        // normalize
        // NOTE(review): the results are reinterpreted as integers below
        // (castps_si), so Normalize presumably returns integer bit patterns - confirm.
        dst.x = Normalize<DstFormat>(dst.x, 0);
        dst.y = Normalize<DstFormat>(dst.y, 1);
        dst.z = Normalize<DstFormat>(dst.z, 2);

        // pack: component 0 occupies the low bits of each 32-bit lane
        simd16scalari packed = _simd16_castps_si(dst.x);

        // The shift amounts below are literal constants (5 and 5 + 6); these
        // asserts check that they agree with the format's per-component bit counts.
        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);

        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));

        // narrow each 32-bit lane to its low 16 bits and write the results
        // contiguously at the start of aosTile
        uint32_t *pPacked = (uint32_t*)&packed;
        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
        for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
        {
            *pAosTile++ = *pPacked++;
        }

#else
        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel

        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];

        // Load hot-tile
        simdvector src, dst;
        LoadSOA<SrcFormat>(pSrc, src);

        // deswizzle: pick the source component that feeds each of the three
        // destination components (dst.w is never written or read here)
        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];

        // clamp
        dst.x = Clamp<DstFormat>(dst.x, 0);
        dst.y = Clamp<DstFormat>(dst.y, 1);
        dst.z = Clamp<DstFormat>(dst.z, 2);

        // normalize
        dst.x = Normalize<DstFormat>(dst.x, 0);
        dst.y = Normalize<DstFormat>(dst.y, 1);
        dst.z = Normalize<DstFormat>(dst.z, 2);

        // pack: component 0 in the low bits, component 1 shifted up by the
        // width of component 0, component 2 by the widths of components 0 and 1
        simdscalari packed = _simd_castps_si(dst.x);
        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
                                                                              FormatTraits<DstFormat>::GetBPC(1)));

        // narrow each 32-bit lane to its low 16 bits and write the results
        // contiguously at the start of aosTile
        uint32_t *pPacked = (uint32_t*)&packed;
        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
        for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
        {
            *pAosTile++ = *pPacked++;
        }

#endif
        // Store data into destination
        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
    }
};
    529 
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Specialization for R32_FLOAT to
/// R24_UNORM_X8_TYPELESS.  Unlike the other paths this one reads the
/// destination first and replaces only the low 24 bits of every 32-bit
/// pixel, so the X8 bits already in memory are preserved.
//////////////////////////////////////////////////////////////////////////
template<>
struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
{
    static const SWR_FORMAT SrcFormat = R32_FLOAT;
    static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;

    //////////////////////////////////////////////////////////////////////////
    /// @brief Converts a SIMD from the Hot Tile to the destination format
    ///        and converts from SOA to AOS.
    /// @param pSrc   - Pointer to raster tile.
    /// @param ppDsts - Destination pointers (surface or deswizzling buffer).
    ///                 The 8x2 path reads and writes ppDsts[0..3]; the other
    ///                 path reads and writes ppDsts[0..1].
    template <size_t NumDests>
    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    {
#if USE_8x2_TILE_BACKEND
        simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));

        // clamp to [0.0, 1.0]
        const simd16scalar zero = _simd16_setzero_ps();
        const simd16scalar ones = _simd16_set1_ps(1.0f);

        comp = _simd16_max_ps(comp, zero);
        comp = _simd16_min_ps(comp, ones);

        // normalize: scale by the destination format's float-to-integer factor
        comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));

        simd16scalari temp = _simd16_cvtps_epi32(comp);

        // swizzle: reorder lanes so each 128-bit quarter lines up with one
        // destination pointer
        temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

        // merge/store data into destination but don't overwrite the X8 bits
        // (read the current contents of all four destinations first)
        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));

        simd16scalari dest = _simd16_setzero_si();

        dest = _simd16_insert_si(dest, destlo, 0);
        dest = _simd16_insert_si(dest, desthi, 1);

        // low 24 bits come from the new value, high 8 bits from memory
        simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);

        dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));

        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
#else
        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel

        OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];

        // Convert from SrcFormat --> DstFormat
        simdvector src;
        LoadSOA<SrcFormat>(pSrc, src);
        StoreSOA<DstFormat>(src, soaTile);

        // Convert from SOA --> AOS
        FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);

        // Store data into destination but don't overwrite the X8 bits
        // Each 4-pixel row is 16-bytes
        __m128i *pZRow01 = (__m128i*)aosTile;
        __m128i vQuad00 = _mm_load_si128(pZRow01);
        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);

        // low 64-bit halves form row 0, high halves form row 1
        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);

        // current destination contents, needed to keep the X8 bits
        __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
        __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);

        // low 24 bits come from the new value, high 8 bits from memory
        __m128i vMask = _mm_set1_epi32(0xFFFFFF);

        vDst0 = _mm_andnot_si128(vMask, vDst0);
        vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
        vDst1 = _mm_andnot_si128(vMask, vDst1);
        vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));

        _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
        _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
#endif
    }
};
    618 
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// FlatConvert (8x2 tile backend, four destination pointers)
/// @brief Converts one 16-wide SIMD of four float components (SOA) into
///        packed 32-bit pixels with one byte per component, and writes
///        16 bytes through each of the four destination pointers.
/// @param pSrc   - Source components; component i is read with an aligned
///                 load at byte offset swizzle(i) * sizeof(simd16scalar).
/// @param pDst0  - Destination for pixels 0 1 4 5 (row 0, first half).
/// @param pDst1  - Destination for pixels 2 3 6 7 (row 1, first half).
/// @param pDst2  - Destination for pixels 8 9 C D (row 0, second half).
/// @param pDst3  - Destination for pixels A B E F (row 1, second half).
/// @tparam DstFormat - Destination format; supplies the swizzle, the sRGB
///                     flag and the per-component float scale factors.
//////////////////////////////////////////////////////////////////////////
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    // swizzle rgba -> bgra while we load
    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
    simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa

    // clamp every component to [0.0, 1.0]
    const simd16scalar zero = _simd16_setzero_ps();
    const simd16scalar ones = _simd16_set1_ps(1.0f);

    comp0 = _simd16_max_ps(comp0, zero);
    comp0 = _simd16_min_ps(comp0, ones);

    comp1 = _simd16_max_ps(comp1, zero);
    comp1 = _simd16_min_ps(comp1, ones);

    comp2 = _simd16_max_ps(comp2, zero);
    comp2 = _simd16_min_ps(comp2, ones);

    comp3 = _simd16_max_ps(comp3, zero);
    comp3 = _simd16_min_ps(comp3, ones);

    // gamma-correct only rgb (the fourth component is left linear)
    if (FormatTraits<DstFormat>::isSRGB)
    {
        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
    }

    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));

    // moving to 16 wide integer vector types
    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
    simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa

    // SOA to AOS conversion: move each component to its own byte of the
    // 32-bit lane (component 0 stays in bits 0..7), then OR them together
    src1 = _simd16_slli_epi32(src1,  8);
    src2 = _simd16_slli_epi32(src2, 16);
    src3 = _simd16_slli_epi32(src3, 24);

    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // de-swizzle conversion
    // Both branches aim at the same lane order (see trailing comments); the
    // '#if 1' selects the permute2f128 + shuffle form over the single permute.
#if 1
    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F

    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // store 8x2 memory order:
    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
}

#endif
    691 template<SWR_FORMAT DstFormat>
    692 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
    693 {
    694     static const uint32_t offset = sizeof(simdscalar);
    695 
    696     // swizzle rgba -> bgra while we load
    697     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
    698     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
    699     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
    700     simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
    701 
    702     // clamp
    703     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
    704     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
    705 
    706     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
    707     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
    708 
    709     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
    710     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
    711 
    712     vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
    713     vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
    714 
    715     if (FormatTraits<DstFormat>::isSRGB)
    716     {
    717         // Gamma-correct only rgb
    718         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
    719         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
    720         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
    721     }
    722 
    723     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
    724     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    725     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    726     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    727     vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
    728 
    729     // moving to 8 wide integer vector types
    730     __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
    731     __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
    732     __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
    733     __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
    734 
    735 #if KNOB_ARCH == KNOB_ARCH_AVX
    736 
    737     // splitting into two sets of 4 wide integer vector types
    738     // because AVX doesn't have instructions to support this operation at 8 wide
    739     __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
    740     __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
    741     __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
    742     __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
    743 
    744     __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
    745     __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
    746     __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
    747     __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
    748 
    749     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
    750     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
    751     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
    752     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
    753     srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
    754     srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
    755 
    756     srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
    757     srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
    758 
    759     srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
    760     srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
    761 
    762     srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
    763     srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
    764 
    765     // unpack into rows that get the tiling order correct
    766     __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
    767     __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
    768 
    769     __m256i final = _mm256_castsi128_si256(vRow00);
    770     final = _mm256_insertf128_si256(final, vRow10, 1);
    771 
    772 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
    773 
    774     // logic is as above, only wider
    775     src1 = _mm256_slli_si256(src1, 1);
    776     src2 = _mm256_slli_si256(src2, 2);
    777     src3 = _mm256_slli_si256(src3, 3);
    778 
    779     src0 = _mm256_or_si256(src0, src1);
    780     src2 = _mm256_or_si256(src2, src3);
    781 
    782     __m256i final = _mm256_or_si256(src0, src2);
    783 #if 0
    784 
    785     __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    786 
    787     final = _mm256_permutevar8x32_epi32(final, perm);
    788 #else
    789 
    790     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
    791     final = _mm256_permute4x64_epi64(final, 0xD8);
    792 #endif
    793 #endif
    794 
    795     _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
    796 }
    797 
    #if USE_8x2_TILE_BACKEND
    //////////////////////////////////////////////////////////////////////////
    /// @brief 16 wide conversion of SOA float pixels into packed
    ///        one-byte-per-channel AOS pixels. Only the first three channels
    ///        are converted; no alpha value is merged into the result.
    /// @param pSrc - Pointer to SOA source; channel c is loaded from
    ///               pSrc + swizzle(c) * sizeof(simd16scalar).
    /// @param pDst0 - Row 0 destination for pixels { 0 1 4 5 }.
    /// @param pDst1 - Row 1 destination for pixels { 2 3 6 7 }.
    /// @param pDst2 - Row 0 destination for pixels { 8 9 C D }.
    /// @param pDst3 - Row 1 destination for pixels { A B E F }.
    template<SWR_FORMAT DstFormat>
    INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
    {
        typedef FormatTraits<DstFormat> DstTraits;

        const size_t compStride = sizeof(simd16scalar);

        // The destination format's swizzle selects which source channel feeds
        // each destination slot (e.g. rgba -> bgra) while we load.
        simd16scalar vChan0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + DstTraits::swizzle(0) * compStride)); // float32 rrrrrrrrrrrrrrrr
        simd16scalar vChan1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + DstTraits::swizzle(1) * compStride)); // float32 gggggggggggggggg
        simd16scalar vChan2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + DstTraits::swizzle(2) * compStride)); // float32 bbbbbbbbbbbbbbbb

        // Clamp every channel to [0.0f, 1.0f].
        const simd16scalar vZero = _simd16_setzero_ps();
        const simd16scalar vOne  = _simd16_set1_ps(1.0f);

        vChan0 = _simd16_min_ps(_simd16_max_ps(vChan0, vZero), vOne);
        vChan1 = _simd16_min_ps(_simd16_max_ps(vChan1, vZero), vOne);
        vChan2 = _simd16_min_ps(_simd16_max_ps(vChan2, vZero), vOne);

        // Gamma-correct the color channels for sRGB destinations.
        if (DstTraits::isSRGB)
        {
            vChan0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vChan0);
            vChan1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vChan1);
            vChan2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vChan2);
        }

        // Scale 0.0f..1.0f up to the integer range of the dest format (0..255).
        vChan0 = _simd16_mul_ps(vChan0, _simd16_set1_ps(DstTraits::fromFloat(0)));
        vChan1 = _simd16_mul_ps(vChan1, _simd16_set1_ps(DstTraits::fromFloat(1)));
        vChan2 = _simd16_mul_ps(vChan2, _simd16_set1_ps(DstTraits::fromFloat(2)));

        // Move to 16 wide integer vectors; each 32-bit lane is a padded byte.
        const simd16scalari iChan0 = _simd16_cvtps_epi32(vChan0); // padded byte rrrrrrrrrrrrrrrr
        const simd16scalari iChan1 = _simd16_cvtps_epi32(vChan1); // padded byte gggggggggggggggg
        const simd16scalari iChan2 = _simd16_cvtps_epi32(vChan2); // padded byte bbbbbbbbbbbbbbbb

        // SOA to AOS: channel 1 goes to bits 8..15 and channel 2 to bits 16..23
        // of every lane, then everything is merged.
        simd16scalari vPixels = _simd16_or_si(
            _simd16_or_si(iChan0, _simd16_slli_epi32(iChan1, 8)),
            _simd16_slli_epi32(iChan2, 16));                                                        // 0 1 2 3 4 5 6 7 8 9 A B C D E F

        // De-swizzle the pixels into row order.
    #if 1
        const simd16scalari vEvenQuads = _simd16_permute2f128_si(vPixels, vPixels, 0xA0); // (2, 2, 0, 0)   // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
        const simd16scalari vOddQuads  = _simd16_permute2f128_si(vPixels, vPixels, 0xF5); // (3, 3, 1, 1)   // 4 5 6 7 4 5 6 7 C D E F C D E F

        vPixels = _simd16_shuffle_epi64(vEvenQuads, vOddQuads, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F

    #else
        vPixels = _simd16_permute_epi32(vPixels, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

    #endif
        // Store in 8x2 memory order:
        //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
        //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
        _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(vPixels, 0));
        _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(vPixels, 1));
    }

    #endif
    863 template<SWR_FORMAT DstFormat>
    864 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
    865 {
    866     static const uint32_t offset = sizeof(simdscalar);
    867 
    868     // swizzle rgba -> bgra while we load
    869     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
    870     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
    871     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
    872                                                                                                             // clamp
    873     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
    874     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
    875 
    876     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
    877     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
    878 
    879     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
    880     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
    881 
    882     if (FormatTraits<DstFormat>::isSRGB)
    883     {
    884         // Gamma-correct only rgb
    885         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
    886         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
    887         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
    888     }
    889 
    890     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
    891     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    892     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    893     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    894 
    895     // moving to 8 wide integer vector types
    896     __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
    897     __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
    898     __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
    899 
    900 #if KNOB_ARCH == KNOB_ARCH_AVX
    901 
    902     // splitting into two sets of 4 wide integer vector types
    903     // because AVX doesn't have instructions to support this operation at 8 wide
    904     __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
    905     __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
    906     __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
    907 
    908     __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
    909     __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
    910     __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
    911 
    912     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
    913     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
    914     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
    915     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
    916 
    917     srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
    918 
    919     srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
    920 
    921     srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
    922     srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
    923 
    924     // unpack into rows that get the tiling order correct
    925     __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
    926     __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
    927 
    928     __m256i final = _mm256_castsi128_si256(vRow00);
    929     final = _mm256_insertf128_si256(final, vRow10, 1);
    930 
    931 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
    932 
    933                                               // logic is as above, only wider
    934     src1 = _mm256_slli_si256(src1, 1);
    935     src2 = _mm256_slli_si256(src2, 2);
    936 
    937     src0 = _mm256_or_si256(src0, src1);
    938 
    939     __m256i final = _mm256_or_si256(src0, src2);
    940 
    941     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
    942     final = _mm256_permute4x64_epi64(final, 0xD8);
    943 
    944 #endif
    945 
    946     _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
    947 }
    948 
    949 template<>
    950 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
    951 {
    952     template <size_t NumDests>
    953     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    954     {
    955 #if USE_8x2_TILE_BACKEND
    956         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    957 #else
    958         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
    959 #endif
    960     }
    961 };
    962 
    963 template<>
    964 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
    965 {
    966     template <size_t NumDests>
    967     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    968     {
    969 #if USE_8x2_TILE_BACKEND
    970         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    971 #else
    972         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
    973 #endif
    974     }
    975 };
    976 
    977 template<>
    978 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
    979 {
    980     template <size_t NumDests>
    981     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    982     {
    983 #if USE_8x2_TILE_BACKEND
    984         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    985 #else
    986         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
    987 #endif
    988     }
    989 };
    990 
    991 template<>
    992 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
    993 {
    994     template <size_t NumDests>
    995     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    996     {
    997 #if USE_8x2_TILE_BACKEND
    998         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    999 #else
   1000         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
   1001 #endif
   1002     }
   1003 };
   1004 
   1005 template<>
   1006 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
   1007 {
   1008     template <size_t NumDests>
   1009     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1010     {
   1011 #if USE_8x2_TILE_BACKEND
   1012         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1013 #else
   1014         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
   1015 #endif
   1016     }
   1017 };
   1018 
   1019 template<>
   1020 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
   1021 {
   1022     template <size_t NumDests>
   1023     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1024     {
   1025 #if USE_8x2_TILE_BACKEND
   1026         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1027 #else
   1028         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
   1029 #endif
   1030     }
   1031 };
   1032 
   1033 template<>
   1034 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
   1035 {
   1036     template <size_t NumDests>
   1037     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1038     {
   1039 #if USE_8x2_TILE_BACKEND
   1040         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1041 #else
   1042         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
   1043 #endif
   1044     }
   1045 };
   1046 
   1047 template<>
   1048 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
   1049 {
   1050     template <size_t NumDests>
   1051     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1052     {
   1053 #if USE_8x2_TILE_BACKEND
   1054         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1055 #else
   1056         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
   1057 #endif
   1058     }
   1059 };
   1060 
   1061 //////////////////////////////////////////////////////////////////////////
   1062 /// StoreRasterTile
   1063 //////////////////////////////////////////////////////////////////////////
   1064 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1065 struct StoreRasterTile
   1066 {
   1067     //////////////////////////////////////////////////////////////////////////
   1068     /// @brief Retrieve color from hot tile source which is always float.
   1069     /// @param pSrc - Pointer to raster tile.
   1070     /// @param x, y - Coordinates to raster tile.
   1071     /// @param output - output color
   1072     INLINE static void GetSwizzledSrcColor(
   1073         uint8_t* pSrc,
   1074         uint32_t x, uint32_t y,
   1075         float outputColor[4])
   1076     {
   1077 #if USE_8x2_TILE_BACKEND
   1078         typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
   1079 
   1080         SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
   1081 
   1082         // Compute which simd tile we're accessing within 8x8 tile.
   1083         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
   1084         uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
   1085 
   1086         SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
   1087 
   1088         uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
   1089 
   1090         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
   1091 #else
   1092         typedef SimdTile<SrcFormat, DstFormat> SimdT;
   1093 
   1094         SimdT* pSrcSimdTiles = (SimdT*)pSrc;
   1095 
   1096         // Compute which simd tile we're accessing within 8x8 tile.
   1097         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
   1098         uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
   1099 
   1100         SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
   1101 
   1102         uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
   1103 
   1104         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
   1105 #endif
   1106     }
   1107 
   1108     //////////////////////////////////////////////////////////////////////////
   1109     /// @brief Stores an 8x8 raster tile to the destination surface.
   1110     /// @param pSrc - Pointer to raster tile.
   1111     /// @param pDstSurface - Destination surface state
   1112     /// @param x, y - Coordinates to raster tile.
   1113     INLINE static void Store(
   1114         uint8_t *pSrc,
   1115         SWR_SURFACE_STATE* pDstSurface,
   1116         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
   1117     {
   1118         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1119         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1120 
   1121         // For each raster tile pixel (rx, ry)
   1122         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
   1123         {
   1124             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
   1125             {
   1126                 // Perform bounds checking.
   1127                 if (((x + rx) < lodWidth) &&
   1128                     ((y + ry) < lodHeight))
   1129                 {
   1130                     float srcColor[4];
   1131                     GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
   1132 
   1133                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
   1134                         pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1135                         sampleNum, pDstSurface->lod, pDstSurface);
   1136                     {
   1137                         ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
   1138                     }
   1139                 }
   1140             }
   1141         }
   1142     }
   1143 };
   1144 
   1145 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1146 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
   1147 {};
   1148 
   1149 //////////////////////////////////////////////////////////////////////////
   1150 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
   1151 //////////////////////////////////////////////////////////////////////////
   1152 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1153 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
   1154 {
   1155     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
   1156     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1157     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1158 
   1159     //////////////////////////////////////////////////////////////////////////
   1160     /// @brief Stores an 8x8 raster tile to the destination surface.
   1161     /// @param pSrc - Pointer to raster tile.
   1162     /// @param pDstSurface - Destination surface state
   1163     /// @param x, y - Coordinates to raster tile.
   1164     INLINE static void Store(
   1165         uint8_t *pSrc,
   1166         SWR_SURFACE_STATE* pDstSurface,
   1167         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1168     {
   1169         // Punt non-full tiles to generic store
   1170         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1171         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1172 
   1173         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1174         {
   1175             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1176         }
   1177 
   1178         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1179             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1180 #if USE_8x2_TILE_BACKEND
   1181 
   1182         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1183         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1184 
   1185         uint8_t* ppDsts[] =
   1186         {
   1187             pDst,                                           // row 0, col 0
   1188             pDst + pDstSurface->pitch,                      // row 1, col 0
   1189             pDst + dx / 2,                                  // row 0, col 1
   1190             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
   1191         };
   1192 
   1193         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1194         {
   1195             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
   1196             {
   1197                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1198 
   1199                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1200 
   1201                 ppDsts[0] += dx;
   1202                 ppDsts[1] += dx;
   1203                 ppDsts[2] += dx;
   1204                 ppDsts[3] += dx;
   1205             }
   1206 
   1207             ppDsts[0] += dy;
   1208             ppDsts[1] += dy;
   1209             ppDsts[2] += dy;
   1210             ppDsts[3] += dy;
   1211         }
   1212 #else
   1213         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
   1214 
   1215         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   1216         {
   1217             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
   1218 
   1219             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   1220             {
   1221                 // Format conversion and convert from SOA to AOS, and store the rows.
   1222                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
   1223 
   1224                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1225                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1226                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
   1227             }
   1228 
   1229             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
   1230             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
   1231         }
   1232 #endif
   1233     }
   1234 };
   1235 
   1236 //////////////////////////////////////////////////////////////////////////
   1237 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
   1238 //////////////////////////////////////////////////////////////////////////
   1239 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1240 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
   1241 {
   1242     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
   1243     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1244     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1245 
   1246     //////////////////////////////////////////////////////////////////////////
   1247     /// @brief Stores an 8x8 raster tile to the destination surface.
   1248     /// @param pSrc - Pointer to raster tile.
   1249     /// @param pDstSurface - Destination surface state
   1250     /// @param x, y - Coordinates to raster tile.
   1251     INLINE static void Store(
   1252         uint8_t *pSrc,
   1253         SWR_SURFACE_STATE* pDstSurface,
   1254         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1255     {
   1256         // Punt non-full tiles to generic store
   1257         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1258         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1259 
   1260         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1261         {
   1262             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1263         }
   1264 
   1265         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1266             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1267 #if USE_8x2_TILE_BACKEND
   1268 
   1269         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1270         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1271 
   1272         uint8_t* ppDsts[] =
   1273         {
   1274             pDst,                                           // row 0, col 0
   1275             pDst + pDstSurface->pitch,                      // row 1, col 0
   1276             pDst + dx / 2,                                  // row 0, col 1
   1277             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
   1278         };
   1279 
   1280         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1281         {
   1282             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
   1283             {
   1284                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1285 
   1286                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1287 
   1288                 ppDsts[0] += dx;
   1289                 ppDsts[1] += dx;
   1290                 ppDsts[2] += dx;
   1291                 ppDsts[3] += dx;
   1292             }
   1293 
   1294             ppDsts[0] += dy;
   1295             ppDsts[1] += dy;
   1296             ppDsts[2] += dy;
   1297             ppDsts[3] += dy;
   1298         }
   1299 #else
   1300         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
   1301 
   1302         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   1303         {
   1304             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
   1305 
   1306             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   1307             {
   1308                 // Format conversion and convert from SOA to AOS, and store the rows.
   1309                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
   1310 
   1311                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1312                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1313                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
   1314             }
   1315 
   1316             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
   1317             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
   1318         }
   1319 #endif
   1320     }
   1321 };
   1322 
   1323 //////////////////////////////////////////////////////////////////////////
   1324 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
   1325 //////////////////////////////////////////////////////////////////////////
   1326 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1327 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
   1328 {
   1329     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
   1330     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1331     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1332 
   1333     //////////////////////////////////////////////////////////////////////////
   1334     /// @brief Stores an 8x8 raster tile to the destination surface.
   1335     /// @param pSrc - Pointer to raster tile.
   1336     /// @param pDstSurface - Destination surface state
   1337     /// @param x, y - Coordinates to raster tile.
   1338     INLINE static void Store(
   1339         uint8_t *pSrc,
   1340         SWR_SURFACE_STATE* pDstSurface,
   1341         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1342     {
   1343         // Punt non-full tiles to generic store
   1344         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1345         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1346 
   1347         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1348         {
   1349             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1350         }
   1351 
   1352         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1353             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1354 #if USE_8x2_TILE_BACKEND
   1355 
   1356         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1357         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1358 
   1359         uint8_t* ppDsts[] =
   1360         {
   1361             pDst,                                           // row 0, col 0
   1362             pDst + pDstSurface->pitch,                      // row 1, col 0
   1363             pDst + dx / 2,                                  // row 0, col 1
   1364             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
   1365         };
   1366 
   1367         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1368         {
   1369             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
   1370             {
   1371                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1372 
   1373                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1374 
   1375                 ppDsts[0] += dx;
   1376                 ppDsts[1] += dx;
   1377                 ppDsts[2] += dx;
   1378                 ppDsts[3] += dx;
   1379             }
   1380 
   1381             ppDsts[0] += dy;
   1382             ppDsts[1] += dy;
   1383             ppDsts[2] += dy;
   1384             ppDsts[3] += dy;
   1385         }
   1386 #else
   1387         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
   1388 
   1389         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   1390         {
   1391             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
   1392 
   1393             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   1394             {
   1395                 // Format conversion and convert from SOA to AOS, and store the rows.
   1396                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
   1397 
   1398                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1399                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1400                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
   1401             }
   1402 
   1403             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
   1404             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
   1405         }
   1406 #endif
   1407     }
   1408 };
   1409 
   1410 //////////////////////////////////////////////////////////////////////////
   1411 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
   1412 //////////////////////////////////////////////////////////////////////////
   1413 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1414 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
   1415 {
   1416     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
   1417     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1418     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1419     static const size_t MAX_DST_COLUMN_BYTES = 16;
   1420 #if !USE_8x2_TILE_BACKEND
   1421     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
   1422     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1423 #endif
   1424 
   1425     //////////////////////////////////////////////////////////////////////////
   1426     /// @brief Stores an 8x8 raster tile to the destination surface.
   1427     /// @param pSrc - Pointer to raster tile.
   1428     /// @param pDstSurface - Destination surface state
   1429     /// @param x, y - Coordinates to raster tile.
   1430     INLINE static void Store(
   1431         uint8_t *pSrc,
   1432         SWR_SURFACE_STATE* pDstSurface,
   1433         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1434     {
   1435         // Punt non-full tiles to generic store
   1436         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1437         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1438 
   1439         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1440         {
   1441             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1442         }
   1443 
   1444         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1445             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1446 #if USE_8x2_TILE_BACKEND
   1447 
   1448         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1449         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
   1450 
   1451         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   1452         static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
   1453 
   1454         uint8_t *ppDsts[] =
   1455         {
   1456             pDst,                                                               // row 0, col 0
   1457             pDst + pDstSurface->pitch,                                          // row 1, col 0
   1458             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
   1459             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
   1460             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
   1461             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
   1462             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
   1463             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3                // row 1, col 3
   1464         };
   1465 
   1466         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1467         {
   1468             // Raster tile width is same as simd16 tile width
   1469             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1470 
   1471             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1472 
   1473             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1474 
   1475             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
   1476             {
   1477                 ppDsts[i] += dy;
   1478             }
   1479         }
   1480 #else
   1481         uint8_t* ppDsts[] =
   1482         {
   1483             pDst,                                               // row 0, col 0
   1484             pDst + pDstSurface->pitch,                          // row 1, col 0
   1485             pDst + MAX_DST_COLUMN_BYTES,                        // row 0, col 1
   1486             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,   // row 1, col 1
   1487         };
   1488 
   1489         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   1490         {
   1491             uint8_t* ppStartRows[] =
   1492             {
   1493                 ppDsts[0],
   1494                 ppDsts[1],
   1495                 ppDsts[2],
   1496                 ppDsts[3],
   1497             };
   1498 
   1499             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   1500             {
   1501                 // Format conversion and convert from SOA to AOS, and store the rows.
   1502                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1503 
   1504                 ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
   1505                 ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
   1506                 ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
   1507                 ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
   1508                 pSrc += SRC_COLUMN_BYTES;
   1509             }
   1510 
   1511             ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
   1512             ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
   1513             ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
   1514             ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
   1515         }
   1516 #endif
   1517     }
   1518 };
   1519 
   1520 //////////////////////////////////////////////////////////////////////////
   1521 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
   1522 //////////////////////////////////////////////////////////////////////////
   1523 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1524 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
   1525 {
   1526     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
   1527     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1528     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1529     static const size_t MAX_DST_COLUMN_BYTES = 16;
   1530 #if !USE_8x2_TILE_BACKEND
   1531     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
   1532     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1533 #endif
   1534 
   1535     //////////////////////////////////////////////////////////////////////////
   1536     /// @brief Stores an 8x8 raster tile to the destination surface.
   1537     /// @param pSrc - Pointer to raster tile.
   1538     /// @param pDstSurface - Destination surface state
   1539     /// @param x, y - Coordinates to raster tile.
   1540     INLINE static void Store(
   1541         uint8_t *pSrc,
   1542         SWR_SURFACE_STATE* pDstSurface,
   1543         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1544     {
   1545         // Punt non-full tiles to generic store
   1546         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1547         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1548 
   1549         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1550         {
   1551             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1552         }
   1553 
   1554         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1555             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1556 #if USE_8x2_TILE_BACKEND
   1557 
   1558         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1559         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
   1560 
   1561         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   1562         static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
   1563 
   1564         uint8_t* ppDsts[] =
   1565         {
   1566             pDst,                                                               // row 0, col 0
   1567             pDst + pDstSurface->pitch,                                          // row 1, col 0
   1568             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
   1569             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
   1570             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
   1571             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
   1572             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
   1573             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,               // row 1, col 3
   1574             pDst + MAX_DST_COLUMN_BYTES * 4,                                    // row 0, col 4
   1575             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,               // row 1, col 4
   1576             pDst + MAX_DST_COLUMN_BYTES * 5,                                    // row 0, col 5
   1577             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,               // row 1, col 5
   1578             pDst + MAX_DST_COLUMN_BYTES * 6,                                    // row 0, col 6
   1579             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,               // row 1, col 6
   1580             pDst + MAX_DST_COLUMN_BYTES * 7,                                    // row 0, col 7
   1581             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,               // row 1, col 7
   1582         };
   1583 
   1584         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1585         {
   1586             // Raster tile width is same as simd16 tile width
   1587             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1588 
   1589             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1590 
   1591             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1592 
   1593             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
   1594             {
   1595                 ppDsts[i] += dy;
   1596             }
   1597         }
   1598 #else
   1599         struct DstPtrs
   1600         {
   1601             uint8_t* ppDsts[8];
   1602         } ptrs;
   1603 
   1604         // Need 8 pointers, 4 columns of 2 rows each
   1605         for (uint32_t y = 0; y < 2; ++y)
   1606         {
   1607             for (uint32_t x = 0; x < 4; ++x)
   1608             {
   1609                 ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
   1610             }
   1611         }
   1612 
   1613         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   1614         {
   1615             DstPtrs startPtrs = ptrs;
   1616 
   1617             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   1618             {
   1619                 // Format conversion and convert from SOA to AOS, and store the rows.
   1620                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
   1621 
   1622                 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
   1623                 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
   1624                 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
   1625                 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
   1626                 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
   1627                 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
   1628                 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
   1629                 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
   1630                 pSrc += SRC_COLUMN_BYTES;
   1631             }
   1632 
   1633             ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
   1634             ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
   1635             ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
   1636             ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
   1637             ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
   1638             ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
   1639             ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
   1640             ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
   1641         }
   1642 #endif
   1643     }
   1644 };
   1645 
   1646 //////////////////////////////////////////////////////////////////////////
   1647 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
   1648 //////////////////////////////////////////////////////////////////////////
   1649 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1650 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
   1651 {
   1652     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
   1653     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1654 
   1655     //////////////////////////////////////////////////////////////////////////
   1656     /// @brief Stores an 8x8 raster tile to the destination surface.
   1657     /// @param pSrc - Pointer to raster tile.
   1658     /// @param pDstSurface - Destination surface state
   1659     /// @param x, y - Coordinates to raster tile.
   1660     INLINE static void Store(
   1661         uint8_t *pSrc,
   1662         SWR_SURFACE_STATE* pDstSurface,
   1663         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1664     {
   1665         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   1666 
   1667         // Punt non-full tiles to generic store
   1668         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1669         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1670 
   1671         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1672         {
   1673             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1674         }
   1675 
   1676         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   1677         // We can compute the offsets to each column within the raster tile once and increment from these.
   1678 #if USE_8x2_TILE_BACKEND
   1679         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   1680         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1681             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1682 
   1683         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   1684 
   1685         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1686         uint8_t *ppDsts[] =
   1687         {
   1688             pDst,
   1689             pDst + DestRowWidthBytes,
   1690             pDst + DestRowWidthBytes / 4,
   1691             pDst + DestRowWidthBytes + DestRowWidthBytes / 4
   1692         };
   1693 
   1694         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1695         {
   1696             // Raster tile width is same as simd16 tile width
   1697             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1698 
   1699             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1700 
   1701             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1702 
   1703             ppDsts[0] += dy;
   1704             ppDsts[1] += dy;
   1705             ppDsts[2] += dy;
   1706             ppDsts[3] += dy;
   1707         }
   1708 #else
   1709         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   1710         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1711             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1712 
   1713         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   1714         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   1715 
   1716         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1717         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   1718         {
   1719             uint32_t rowOffset = row * DestRowWidthBytes;
   1720 
   1721             uint8_t* pRow = pCol0 + rowOffset;
   1722             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
   1723 
   1724             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1725             pSrc += pSrcInc;
   1726 
   1727             ppDsts[0] += DestRowWidthBytes / 4;
   1728             ppDsts[1] += DestRowWidthBytes / 4;
   1729 
   1730             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1731             pSrc += pSrcInc;
   1732         }
   1733 #endif
   1734     }
   1735 };
   1736 
   1737 //////////////////////////////////////////////////////////////////////////
   1738 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
   1739 //////////////////////////////////////////////////////////////////////////
   1740 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1741 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
   1742 {
   1743     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
   1744     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1745 
   1746     //////////////////////////////////////////////////////////////////////////
   1747     /// @brief Stores an 8x8 raster tile to the destination surface.
   1748     /// @param pSrc - Pointer to raster tile.
   1749     /// @param pDstSurface - Destination surface state
   1750     /// @param x, y - Coordinates to raster tile.
   1751     INLINE static void Store(
   1752         uint8_t *pSrc,
   1753         SWR_SURFACE_STATE* pDstSurface,
   1754         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1755     {
   1756         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   1757 
   1758         // Punt non-full tiles to generic store
   1759         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1760         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1761 
   1762         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1763         {
   1764             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1765         }
   1766 
   1767         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   1768         // We can compute the offsets to each column within the raster tile once and increment from these.
   1769 #if USE_8x2_TILE_BACKEND
   1770         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   1771         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1772             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1773 
   1774         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   1775 
   1776         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1777         uint8_t *ppDsts[] =
   1778         {
   1779             pDst,
   1780             pDst + DestRowWidthBytes,
   1781             pDst + DestRowWidthBytes / 2,
   1782             pDst + DestRowWidthBytes + DestRowWidthBytes / 2
   1783         };
   1784 
   1785         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1786         {
   1787             // Raster tile width is same as simd16 tile width
   1788             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1789 
   1790             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1791 
   1792             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1793 
   1794             ppDsts[0] += dy;
   1795             ppDsts[1] += dy;
   1796             ppDsts[2] += dy;
   1797             ppDsts[3] += dy;
   1798         }
   1799 #else
   1800         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   1801         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1802             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1803 
   1804         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   1805         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   1806 
   1807         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1808         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   1809         {
   1810             uint32_t rowOffset = row * DestRowWidthBytes;
   1811 
   1812             uint8_t* pRow = pCol0 + rowOffset;
   1813             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
   1814 
   1815             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1816             pSrc += pSrcInc;
   1817 
   1818             ppDsts[0] += DestRowWidthBytes / 2;
   1819             ppDsts[1] += DestRowWidthBytes / 2;
   1820 
   1821             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1822             pSrc += pSrcInc;
   1823         }
   1824 #endif
   1825     }
   1826 };
   1827 
   1828 //////////////////////////////////////////////////////////////////////////
   1829 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
   1830 //////////////////////////////////////////////////////////////////////////
   1831 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1832 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
   1833 {
   1834     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
   1835     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1836     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1837 
   1838     //////////////////////////////////////////////////////////////////////////
   1839     /// @brief Stores an 8x8 raster tile to the destination surface.
   1840     /// @param pSrc - Pointer to raster tile.
   1841     /// @param pDstSurface - Destination surface state
   1842     /// @param x, y - Coordinates to raster tile.
   1843     INLINE static void Store(
   1844         uint8_t *pSrc,
   1845         SWR_SURFACE_STATE* pDstSurface,
   1846         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1847     {
   1848         static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
   1849 
   1850         // Punt non-full tiles to generic store
   1851         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1852         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1853 
   1854         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1855         {
   1856             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1857         }
   1858 
   1859         // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
   1860         // We can compute the offsets to each column within the raster tile once and increment from these.
   1861 #if USE_8x2_TILE_BACKEND
   1862         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1863             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1864 
   1865         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1866         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1867 
   1868         uint8_t* ppDsts[] =
   1869         {
   1870             pDst,                                           // row 0, col 0
   1871             pDst + DestRowWidthBytes,                       // row 1, col 0
   1872             pDst + dx / 2,                                  // row 0, col 1
   1873             pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
   1874         };
   1875 
   1876         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1877         {
   1878             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
   1879             {
   1880                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1881 
   1882                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1883 
   1884                 ppDsts[0] += dx;
   1885                 ppDsts[1] += dx;
   1886                 ppDsts[2] += dx;
   1887                 ppDsts[3] += dx;
   1888             }
   1889 
   1890             ppDsts[0] += dy;
   1891             ppDsts[1] += dy;
   1892             ppDsts[2] += dy;
   1893             ppDsts[3] += dy;
   1894         }
   1895 #else
   1896         uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1897             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1898         uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
   1899 
   1900         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   1901         {
   1902             for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
   1903             {
   1904                 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
   1905 
   1906                 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
   1907                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1908 
   1909                 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   1910                 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   1911             }
   1912 
   1913             pRow0 += (DestRowWidthBytes * 2);
   1914             pRow1 += (DestRowWidthBytes * 2);
   1915         }
   1916 #endif
   1917     }
   1918 };
   1919 
   1920 //////////////////////////////////////////////////////////////////////////
   1921 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
   1922 //////////////////////////////////////////////////////////////////////////
   1923 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1924 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
   1925 {
   1926     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
   1927     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1928 
   1929     //////////////////////////////////////////////////////////////////////////
   1930     /// @brief Stores an 8x8 raster tile to the destination surface.
   1931     /// @param pSrc - Pointer to raster tile.
   1932     /// @param pDstSurface - Destination surface state
   1933     /// @param x, y - Coordinates to raster tile.
   1934     INLINE static void Store(
   1935         uint8_t *pSrc,
   1936         SWR_SURFACE_STATE* pDstSurface,
   1937         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1938     {
   1939         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   1940         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
   1941 
   1942         // Punt non-full tiles to generic store
   1943         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1944         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1945 
   1946         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1947         {
   1948             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1949         }
   1950 
   1951         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   1952         // We can compute the offsets to each column within the raster tile once and increment from these.
   1953 #if USE_8x2_TILE_BACKEND
   1954         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   1955         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1956             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1957 
   1958         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   1959         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   1960 
   1961         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1962         uint8_t *ppDsts[] =
   1963         {
   1964             pDst,                                           // row 0, col 0
   1965             pDst + DestRowWidthBytes,                       // row 1, col 0
   1966             pDst + DestColumnBytes,                         // row 0, col 1
   1967             pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
   1968         };
   1969 
   1970         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1971         {
   1972             // Raster tile width is same as simd16 tile width
   1973             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1974 
   1975             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1976 
   1977             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1978 
   1979             ppDsts[0] += dy;
   1980             ppDsts[1] += dy;
   1981             ppDsts[2] += dy;
   1982             ppDsts[3] += dy;
   1983         }
   1984 #else
   1985         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   1986         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1987             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1988 
   1989         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   1990         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   1991 
   1992         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1993         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   1994         {
   1995             uint32_t rowOffset = row * DestRowWidthBytes;
   1996 
   1997             uint8_t* pRow = pCol0 + rowOffset;
   1998             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
   1999 
   2000             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2001             pSrc += pSrcInc;
   2002 
   2003             ppDsts[0] += DestColumnBytes;
   2004             ppDsts[1] += DestColumnBytes;
   2005 
   2006             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2007             pSrc += pSrcInc;
   2008         }
   2009 #endif
   2010     }
   2011 };
   2012 
   2013 //////////////////////////////////////////////////////////////////////////
   2014 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
   2015 //////////////////////////////////////////////////////////////////////////
   2016 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   2017 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
   2018 {
   2019     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
   2020     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   2021 
   2022     //////////////////////////////////////////////////////////////////////////
   2023     /// @brief Stores an 8x8 raster tile to the destination surface.
   2024     /// @param pSrc - Pointer to raster tile.
   2025     /// @param pDstSurface - Destination surface state
   2026     /// @param x, y - Coordinates to raster tile.
   2027     INLINE static void Store(
   2028         uint8_t *pSrc,
   2029         SWR_SURFACE_STATE* pDstSurface,
   2030         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   2031     {
   2032         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   2033         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
   2034 
   2035         // Punt non-full tiles to generic store
   2036         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   2037         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   2038 
   2039         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   2040         {
   2041             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   2042         }
   2043 
   2044         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   2045         // We can compute the offsets to each column within the raster tile once and increment from these.
   2046 #if USE_8x2_TILE_BACKEND
   2047         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   2048         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2049             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2050 
   2051         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   2052         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   2053 
   2054         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2055         uint8_t *ppDsts[] =
   2056         {
   2057             pDst,                                           // row 0, col 0
   2058             pDst + DestRowWidthBytes,                       // row 1, col 0
   2059             pDst + DestColumnBytes,                         // row 0, col 1
   2060             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
   2061             pDst + DestColumnBytes * 2,                     // row 0, col 2
   2062             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
   2063             pDst + DestColumnBytes * 3,                     // row 0, col 3
   2064             pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
   2065         };
   2066 
   2067         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   2068         {
   2069             // Raster tile width is same as simd16 tile width
   2070             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   2071 
   2072             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2073 
   2074             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   2075 
   2076             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
   2077             {
   2078                 ppDsts[i] += dy;
   2079             }
   2080         }
   2081 #else
   2082         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   2083         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2084             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2085         uint8_t* pCol1 = pCol0 + DestColumnBytes;
   2086 
   2087         // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
   2088         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   2089         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   2090 
   2091         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2092         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   2093         {
   2094             uint32_t rowOffset = row * DestRowWidthBytes;
   2095             uint8_t* ppDsts[] =
   2096             {
   2097                 pCol0 + rowOffset,
   2098                 pCol0 + rowOffset + DestRowWidthBytes,
   2099                 pCol1 + rowOffset,
   2100                 pCol1 + rowOffset + DestRowWidthBytes,
   2101             };
   2102 
   2103             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2104             pSrc += pSrcInc;
   2105 
   2106             ppDsts[0] += DestColumnBytes * 2;
   2107             ppDsts[1] += DestColumnBytes * 2;
   2108             ppDsts[2] += DestColumnBytes * 2;
   2109             ppDsts[3] += DestColumnBytes * 2;
   2110 
   2111             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2112             pSrc += pSrcInc;
   2113         }
   2114 #endif
   2115     }
   2116 };
   2117 
   2118 //////////////////////////////////////////////////////////////////////////
   2119 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
   2120 //////////////////////////////////////////////////////////////////////////
   2121 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   2122 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
   2123 {
   2124     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
   2125 #if USE_8x2_TILE_BACKEND
   2126     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   2127 
   2128 #else
   2129     static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
   2130     static const size_t TILE_Y_ROWS = 32;
   2131     static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
   2132 
   2133     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   2134     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   2135     static const size_t MAX_DST_COLUMN_BYTES = 16;
   2136 
   2137     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
   2138     static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
   2139 
   2140 #endif
   2141     //////////////////////////////////////////////////////////////////////////
   2142     /// @brief Stores an 8x8 raster tile to the destination surface.
   2143     /// @param pSrc - Pointer to raster tile.
   2144     /// @param pDstSurface - Destination surface state
   2145     /// @param x, y - Coordinates to raster tile.
   2146     INLINE static void Store(
   2147         uint8_t *pSrc,
   2148         SWR_SURFACE_STATE* pDstSurface,
   2149         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   2150     {
   2151 #if USE_8x2_TILE_BACKEND
   2152         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   2153         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
   2154 #endif
   2155 
   2156         // Punt non-full tiles to generic store
   2157         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   2158         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   2159 
   2160         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   2161         {
   2162             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   2163         }
   2164 
   2165         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   2166         // We can compute the offsets to each column within the raster tile once and increment from these.
   2167 #if USE_8x2_TILE_BACKEND
   2168         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   2169         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2170             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2171 
   2172         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   2173         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   2174 
   2175         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2176         uint8_t *ppDsts[] =
   2177         {
   2178             pDst,                                           // row 0, col 0
   2179             pDst + DestRowWidthBytes,                       // row 1, col 0
   2180             pDst + DestColumnBytes,                         // row 0, col 1
   2181             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
   2182             pDst + DestColumnBytes * 2,                     // row 0, col 2
   2183             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
   2184             pDst + DestColumnBytes * 3,                     // row 0, col 3
   2185             pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
   2186             pDst + DestColumnBytes * 4,                     // row 0, col 4
   2187             pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
   2188             pDst + DestColumnBytes * 5,                     // row 0, col 5
   2189             pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
   2190             pDst + DestColumnBytes * 6,                     // row 0, col 6
   2191             pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
   2192             pDst + DestColumnBytes * 7,                     // row 0, col 7
   2193             pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
   2194         };
   2195 
   2196         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   2197         {
   2198             // Raster tile width is same as simd16 tile width
   2199             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   2200 
   2201             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2202 
   2203             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   2204 
   2205             for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
   2206             {
   2207                 ppDsts[i] += dy;
   2208             }
   2209         }
   2210 #else
   2211         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   2212         uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2213             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2214         struct DstPtrs
   2215         {
   2216             uint8_t* ppDsts[8];
   2217         } ptrs;
   2218 
   2219         // Need 8 pointers, 4 columns of 2 rows each
   2220         for (uint32_t y = 0; y < 2; ++y)
   2221         {
   2222             for (uint32_t x = 0; x < 4; ++x)
   2223             {
   2224                 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
   2225             }
   2226         }
   2227 
   2228         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   2229         {
   2230             DstPtrs startPtrs = ptrs;
   2231 
   2232             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   2233             {
   2234                 // Format conversion and convert from SOA to AOS, and store the rows.
   2235                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
   2236 
   2237                 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
   2238                 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
   2239                 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
   2240                 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
   2241                 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
   2242                 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
   2243                 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
   2244                 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
   2245                 pSrc += SRC_COLUMN_BYTES;
   2246             }
   2247 
   2248             ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2249             ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2250             ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2251             ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2252             ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2253             ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2254             ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2255             ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2256         }
   2257 #endif
   2258     }
   2259 };
   2260 
   2261 //////////////////////////////////////////////////////////////////////////
   2262 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
   2263 //////////////////////////////////////////////////////////////////////////
   2264 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   2265 struct StoreMacroTile
   2266 {
   2267     //////////////////////////////////////////////////////////////////////////
   2268     /// @brief Stores a macrotile to the destination surface using safe implementation.
   2269     /// @param pSrc - Pointer to macro tile.
   2270     /// @param pDstSurface - Destination surface state
   2271     /// @param x, y - Coordinates to macro tile
   2272     static void StoreGeneric(
   2273         uint8_t *pSrcHotTile,
   2274         SWR_SURFACE_STATE* pDstSurface,
   2275         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
   2276     {
   2277         PFN_STORE_TILES_INTERNAL pfnStore;
   2278         pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
   2279 
   2280         // Store each raster tile from the hot tile to the destination surface.
   2281         for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
   2282         {
   2283             for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
   2284             {
   2285                 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
   2286                 {
   2287                     pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
   2288                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
   2289                 }
   2290             }
   2291         }
   2292 
   2293     }
   2294 
   2295     typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
   2296     //////////////////////////////////////////////////////////////////////////
   2297     /// @brief Stores a macrotile to the destination surface.
   2298     /// @param pSrc - Pointer to macro tile.
   2299     /// @param pDstSurface - Destination surface state
   2300     /// @param x, y - Coordinates to macro tile
   2301     static void Store(
   2302         uint8_t *pSrcHotTile,
   2303         SWR_SURFACE_STATE* pDstSurface,
   2304         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
   2305     {
   2306         PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
   2307 
   2308         for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
   2309         {
   2310             size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
   2311                 0,
   2312                 0,
   2313                 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
   2314                 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
   2315                 sampleNum,
   2316                 pDstSurface->lod,
   2317                 pDstSurface);
   2318 
   2319             // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
   2320             bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
   2321                 (pDstSurface->bInterleavedSamples);
   2322 
   2323             pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
   2324         }
   2325 
   2326         // Store each raster tile from the hot tile to the destination surface.
   2327         for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
   2328         {
   2329             for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
   2330             {
   2331                 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
   2332                 {
   2333                     pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
   2334                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
   2335                 }
   2336             }
   2337         }
   2338     }
   2339 };
   2340 
   2341 //////////////////////////////////////////////////////////////////////////
   2342 /// InitStoreTilesTable - Helper for setting up the tables.
   2343 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
   2344 void InitStoreTilesTableColor_Half1(
   2345     PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
   2346 {
   2347     table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
   2348     table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
   2349     table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
   2350     table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
   2351     table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
   2352     table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
   2353     table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
   2354     table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
   2355     table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
   2356     table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
   2357     table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
   2358     table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
   2359     table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
   2360     table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
   2361     table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
   2362     table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
   2363     table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
   2364     table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
   2365     table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
   2366     table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
   2367     table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
   2368     table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
   2369     table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
   2370     table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
   2371     table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
   2372     table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
   2373     table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
   2374     table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
   2375     table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
   2376     table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
   2377     table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
   2378     table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
   2379     table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
   2380     table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
   2381     table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
   2382     table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
   2383     table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
   2384     table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
   2385     table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
   2386     table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
   2387     table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
   2388     table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
   2389     table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
   2390     table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
   2391     table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
   2392     table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
   2393     table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
   2394     table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
   2395     table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
   2396     table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
   2397     table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
   2398     table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
   2399     table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
   2400     table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
   2401     table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
   2402     table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
   2403 }
   2404 
   2405 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
   2406 void InitStoreTilesTableColor_Half2(
   2407     PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
   2408 {
   2409     table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
   2410     table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
   2411     table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
   2412     table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
   2413     table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
   2414     table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
   2415     table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
   2416     table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
   2417     table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
   2418     table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
   2419     table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
   2420     table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
   2421     table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
   2422     table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
   2423     table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
   2424     table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
   2425     table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
   2426     table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
   2427     table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
   2428     table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
   2429     table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
   2430     table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
   2431     table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
   2432     table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
   2433     table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
   2434     table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
   2435     table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
   2436     table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
   2437     table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
   2438     table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
   2439     table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
   2440     table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
   2441     table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
   2442     table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
   2443     table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
   2444     table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
   2445     table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
   2446     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
   2447     table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
   2448     table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
   2449     table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
   2450     table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
   2451     table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
   2452     table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
   2453     table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
   2454     table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
   2455     table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
   2456     table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
   2457     table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
   2458     table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
   2459     table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
   2460     table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
   2461     table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
   2462     table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
   2463     table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
   2464     table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
   2465     table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
   2466     table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
   2467     table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
   2468     table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
   2469     table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
   2470     table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
   2471     table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
   2472     table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
   2473 }
   2474 
   2475 //////////////////////////////////////////////////////////////////////////
   2476 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
   2477 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
   2478 void InitStoreTilesTableDepth(
   2479     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
   2480 {
   2481    table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
   2482    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
   2483    table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
   2484    table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
   2485 }
   2486 
   2487 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
   2488 void InitStoreTilesTableStencil(
   2489     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
   2490 {
   2491     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
   2492 }
   2493