Home | History | Annotate | Download | only in memory
      1 /****************************************************************************
      2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file StoreTile.h
     24 *
     25 * @brief Functionality for Store.
     26 *
     27 ******************************************************************************/
     28 #pragma once
     29 
     30 #include "common/os.h"
     31 #include "common/formats.h"
     32 #include "core/context.h"
     33 #include "core/rdtsc_core.h"
     34 #include "core/format_conversion.h"
     35 
     36 #include "memory/TilingFunctions.h"
     37 #include "memory/Convert.h"
     38 #include "core/multisample.h"
     39 
     40 #include <array>
     41 #include <sstream>
     42 
     43 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
     44 
     45 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
     46 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
     47 
     48 //////////////////////////////////////////////////////////////////////////
     49 /// Store Raster Tile Function Tables.
     50 //////////////////////////////////////////////////////////////////////////
     51 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
     52 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
     53 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
     54 
     55 void InitStoreTilesTable_Linear_1();
     56 void InitStoreTilesTable_Linear_2();
     57 void InitStoreTilesTable_TileX_1();
     58 void InitStoreTilesTable_TileX_2();
     59 void InitStoreTilesTable_TileY_1();
     60 void InitStoreTilesTable_TileY_2();
     61 void InitStoreTilesTable_TileW();
     62 void InitStoreTilesTable();
     63 
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for storing one AOS raster tile to its
///        destination rows.  Store() is deleted here, so only the explicit
///        specializations for a given (PixelSize, NumDests) pair can be used.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam PixelSize - Pixel size selecting the specialization; callers in
///                     this file pass FormatTraits<...>::bpp.
/// @tparam NumDests - Number of destination pointers.  Each pair of
///                    pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Deleted: an unsupported (PixelSize, NumDests) combination fails to compile.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
     78 
     79 //////////////////////////////////////////////////////////////////////////
     80 /// StorePixels (32-bit pixel specialization)
     81 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
     82 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
     83 /// @param ppDsts   - Array of destination pointers.  Each pointer is
     84 ///                   to a single row of at most 16B.
     85 /// @tparam NumDests - Number of destination pointers.  Each pair of
     86 ///                    pointers is for a 16-byte column of two rows.
     87 //////////////////////////////////////////////////////////////////////////
     88 template <>
     89 struct StorePixels<8, 2>
     90 {
     91     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
     92     {
     93         // Each 4-pixel row is 4 bytes.
     94         const uint16_t* pPixSrc = (const uint16_t*)pSrc;
     95 
     96         // Unswizzle from SWR-Z order
     97         uint16_t* pRow = (uint16_t*)ppDsts[0];
     98         pRow[0] = pPixSrc[0];
     99         pRow[1] = pPixSrc[2];
    100 
    101         pRow = (uint16_t*)ppDsts[1];
    102         pRow[0] = pPixSrc[1];
    103         pRow[1] = pPixSrc[3];
    104     }
    105 };
    106 
    107 #if USE_8x2_TILE_BACKEND
    108 template <>
    109 struct StorePixels<8, 4>
    110 {
    111     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    112     {
    113         // 8 x 2 bytes = 16 bytes, 16 pixels
    114         const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
    115 
    116         uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
    117 
    118         // Unswizzle from SWR-Z order
    119         ppDsts16[0][0] = pSrc16[0];     // 0 1
    120         ppDsts16[0][1] = pSrc16[2];     // 4 5
    121 
    122         ppDsts16[1][0] = pSrc16[1];     // 2 3
    123         ppDsts16[1][1] = pSrc16[3];     // 6 7
    124 
    125         ppDsts16[2][0] = pSrc16[4];     // 8 9
    126         ppDsts16[2][1] = pSrc16[6];     // C D
    127 
    128         ppDsts16[3][0] = pSrc16[5];     // A B
    129         ppDsts16[3][1] = pSrc16[7];     // E F
    130     }
    131 };
    132 
    133 #endif
    134 //////////////////////////////////////////////////////////////////////////
    135 /// StorePixels (32-bit pixel specialization)
    136 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    137 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    138 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    139 ///                   to a single row of at most 16B.
    140 /// @tparam NumDests - Number of destination pointers.  Each pair of
    141 ///                    pointers is for a 16-byte column of two rows.
    142 //////////////////////////////////////////////////////////////////////////
    143 template <>
    144 struct StorePixels<16, 2>
    145 {
    146     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    147     {
    148         // Each 4-pixel row is 8 bytes.
    149         const uint32_t* pPixSrc = (const uint32_t*)pSrc;
    150 
    151         // Unswizzle from SWR-Z order
    152         uint32_t* pRow = (uint32_t*)ppDsts[0];
    153         pRow[0] = pPixSrc[0];
    154         pRow[1] = pPixSrc[2];
    155 
    156         pRow = (uint32_t*)ppDsts[1];
    157         pRow[0] = pPixSrc[1];
    158         pRow[1] = pPixSrc[3];
    159     }
    160 };
    161 
    162 #if USE_8x2_TILE_BACKEND
    163 template <>
    164 struct StorePixels<16, 4>
    165 {
    166     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    167     {
    168         // 8 x 4 bytes = 32 bytes, 16 pixels
    169         const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
    170 
    171         uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
    172 
    173         // Unswizzle from SWR-Z order
    174         ppDsts32[0][0] = pSrc32[0];     // 0 1
    175         ppDsts32[0][1] = pSrc32[2];     // 4 5
    176 
    177         ppDsts32[1][0] = pSrc32[1];     // 2 3
    178         ppDsts32[1][1] = pSrc32[3];     // 6 7
    179 
    180         ppDsts32[2][0] = pSrc32[4];     // 8 9
    181         ppDsts32[2][1] = pSrc32[6];     // C D
    182 
    183         ppDsts32[3][0] = pSrc32[5];     // A B
    184         ppDsts32[3][1] = pSrc32[7];     // E F
    185     }
    186 };
    187 
    188 #endif
    189 //////////////////////////////////////////////////////////////////////////
    190 /// StorePixels (32-bit pixel specialization)
    191 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    192 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    193 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    194 ///                   to a single row of at most 16B.
    195 /// @tparam NumDests - Number of destination pointers.  Each pair of
    196 ///                    pointers is for a 16-byte column of two rows.
    197 //////////////////////////////////////////////////////////////////////////
    198 template <>
    199 struct StorePixels<32, 2>
    200 {
    201     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    202     {
    203         // Each 4-pixel row is 16-bytes
    204         simd4scalari *pZRow01 = (simd4scalari*)pSrc;
    205         simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
    206         simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
    207 
    208         simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
    209         simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
    210 
    211         SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
    212         SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
    213     }
    214 };
    215 
    216 #if USE_8x2_TILE_BACKEND
    217 template <>
    218 struct StorePixels<32, 4>
    219 {
    220     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    221     {
    222         // 4 x 16 bytes = 64 bytes, 16 pixels
    223         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
    224 
    225         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
    226 
    227         // Unswizzle from SWR-Z order
    228         simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);                        // 0 1 2 3
    229         simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);                        // 4 5 6 7
    230         simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);                        // 8 9 A B
    231         simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);                        // C D E F
    232 
    233         SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
    234         SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
    235         SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
    236         SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
    237     }
    238 };
    239 
    240 #endif
    241 //////////////////////////////////////////////////////////////////////////
    242 /// StorePixels (32-bit pixel specialization)
    243 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    244 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    245 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    246 ///                   to a single row of at most 16B.
    247 /// @tparam NumDests - Number of destination pointers.  Each pair of
    248 ///                    pointers is for a 16-byte column of two rows.
    249 //////////////////////////////////////////////////////////////////////////
    250 template <>
    251 struct StorePixels<64, 4>
    252 {
    253     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    254     {
    255         // Each 4-pixel row is 32 bytes.
    256         const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
    257 
    258         // order of pointers match SWR-Z layout
    259         simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
    260         *pvDsts[0] = pPixSrc[0];
    261         *pvDsts[1] = pPixSrc[1];
    262         *pvDsts[2] = pPixSrc[2];
    263         *pvDsts[3] = pPixSrc[3];
    264     }
    265 };
    266 
    267 #if USE_8x2_TILE_BACKEND
    268 template <>
    269 struct StorePixels<64, 8>
    270 {
    271     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    272     {
    273         // 8 x 16 bytes = 128 bytes, 16 pixels
    274         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
    275 
    276         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
    277 
    278         // order of pointers match SWR-Z layout
    279         *ppDsts128[0] = pSrc128[0];     // 0 1
    280         *ppDsts128[1] = pSrc128[1];     // 2 3
    281         *ppDsts128[2] = pSrc128[2];     // 4 5
    282         *ppDsts128[3] = pSrc128[3];     // 6 7
    283         *ppDsts128[4] = pSrc128[4];     // 8 9
    284         *ppDsts128[5] = pSrc128[5];     // A B
    285         *ppDsts128[6] = pSrc128[6];     // C D
    286         *ppDsts128[7] = pSrc128[7];     // E F
    287     }
    288 };
    289 
    290 #endif
    291 //////////////////////////////////////////////////////////////////////////
    292 /// StorePixels (32-bit pixel specialization)
    293 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
    294 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
    295 /// @param ppDsts   - Array of destination pointers.  Each pointer is
    296 ///                   to a single row of at most 16B.
    297 /// @tparam NumDests - Number of destination pointers.  Each pair of
    298 ///                    pointers is for a 16-byte column of two rows.
    299 //////////////////////////////////////////////////////////////////////////
    300 template <>
    301 struct StorePixels<128, 8>
    302 {
    303     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
    304     {
    305         // Each 4-pixel row is 64 bytes.
    306         const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
    307 
    308         // Unswizzle from SWR-Z order
    309         simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
    310         *pvDsts[0] = pPixSrc[0];
    311         *pvDsts[1] = pPixSrc[2];
    312         *pvDsts[2] = pPixSrc[1];
    313         *pvDsts[3] = pPixSrc[3];
    314         *pvDsts[4] = pPixSrc[4];
    315         *pvDsts[5] = pPixSrc[6];
    316         *pvDsts[6] = pPixSrc[5];
    317         *pvDsts[7] = pPixSrc[7];
    318     }
    319 };
    320 
    321 #if USE_8x2_TILE_BACKEND
    322 template <>
    323 struct StorePixels<128, 16>
    324 {
    325     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
    326     {
    327         // 16 x 16 bytes = 256 bytes, 16 pixels
    328         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
    329 
    330         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
    331 
    332         for (uint32_t i = 0; i < 16; i += 4)
    333         {
    334             *ppDsts128[i + 0] = pSrc128[i + 0];
    335             *ppDsts128[i + 1] = pSrc128[i + 2];
    336             *ppDsts128[i + 2] = pSrc128[i + 1];
    337             *ppDsts128[i + 3] = pSrc128[i + 3];
    338         }
    339     }
    340 };
    341 
    342 #endif
    343 //////////////////////////////////////////////////////////////////////////
    344 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
    345 //////////////////////////////////////////////////////////////////////////
    346 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
    347 struct ConvertPixelsSOAtoAOS
    348 {
    349     //////////////////////////////////////////////////////////////////////////
    350     /// @brief Converts a SIMD from the Hot Tile to the destination format
    351     ///        and converts from SOA to AOS.
    352     /// @param pSrc - Pointer to raster tile.
    353     /// @param pDst - Pointer to destination surface or deswizzling buffer.
    354     template <size_t NumDests>
    355     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    356     {
    357 #if USE_8x2_TILE_BACKEND
    358         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
    359 
    360         OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
    361         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    362 
    363         // Convert from SrcFormat --> DstFormat
    364         simd16vector src;
    365         LoadSOA<SrcFormat>(pSrc, src);
    366         StoreSOA<DstFormat>(src, soaTile);
    367 
    368         // Convert from SOA --> AOS
    369         FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
    370 
    371 #else
    372         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
    373 
    374         OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
    375         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    376 
    377         // Convert from SrcFormat --> DstFormat
    378         simdvector src;
    379         LoadSOA<SrcFormat>(pSrc, src);
    380         StoreSOA<DstFormat>(src, soaTile);
    381 
    382         // Convert from SOA --> AOS
    383         FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
    384 
    385 #endif
    386         // Store data into destination
    387         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
    388     }
    389 };
    390 
    391 //////////////////////////////////////////////////////////////////////////
    392 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
    393 /// Specialization for no format conversion
    394 //////////////////////////////////////////////////////////////////////////
    395 template<SWR_FORMAT Format>
    396 struct ConvertPixelsSOAtoAOS<Format, Format>
    397 {
    398     //////////////////////////////////////////////////////////////////////////
    399     /// @brief Converts a SIMD from the Hot Tile to the destination format
    400     ///        and converts from SOA to AOS.
    401     /// @param pSrc - Pointer to raster tile.
    402     /// @param pDst - Pointer to destination surface or deswizzling buffer.
    403     template <size_t NumDests>
    404     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    405     {
    406 #if USE_8x2_TILE_BACKEND
    407         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
    408 
    409         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    410 
    411         // Convert from SOA --> AOS
    412         FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
    413 
    414 #else
    415         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
    416 
    417         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    418 
    419         // Convert from SOA --> AOS
    420         FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
    421 
    422 #endif
    423         // Store data into destination
    424         StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
    425     }
    426 };
    427 
    428 //////////////////////////////////////////////////////////////////////////
    429 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
    430 //////////////////////////////////////////////////////////////////////////
    431 template<>
    432 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
    433 {
    434     //////////////////////////////////////////////////////////////////////////
    435     /// @brief Converts a SIMD from the Hot Tile to the destination format
    436     ///        and converts from SOA to AOS.
    437     /// @param pSrc - Pointer to raster tile.
    438     /// @param pDst - Pointer to destination surface or deswizzling buffer.
    439     template <size_t NumDests>
    440     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    441     {
    442 #if USE_8x2_TILE_BACKEND
    443         static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
    444         static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
    445 
    446         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
    447 
    448         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    449 
    450         // Load hot-tile
    451         simd16vector src, dst;
    452         LoadSOA<SrcFormat>(pSrc, src);
    453 
    454         // deswizzle
    455         dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
    456         dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
    457         dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
    458 
    459         // clamp
    460         dst.x = Clamp<DstFormat>(dst.x, 0);
    461         dst.y = Clamp<DstFormat>(dst.y, 1);
    462         dst.z = Clamp<DstFormat>(dst.z, 2);
    463 
    464         // normalize
    465         dst.x = Normalize<DstFormat>(dst.x, 0);
    466         dst.y = Normalize<DstFormat>(dst.y, 1);
    467         dst.z = Normalize<DstFormat>(dst.z, 2);
    468 
    469         // pack
    470         simd16scalari packed = _simd16_castps_si(dst.x);
    471 
    472         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
    473         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
    474 
    475         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
    476         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
    477 
    478         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
    479         uint32_t *pPacked = (uint32_t*)&packed;
    480         uint16_t *pAosTile = (uint16_t*)&aosTile[0];
    481         for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
    482         {
    483             *pAosTile++ = *pPacked++;
    484         }
    485 
    486 #else
    487         static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
    488         static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
    489         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
    490 
    491         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    492 
    493         // Load hot-tile
    494         simdvector src, dst;
    495         LoadSOA<SrcFormat>(pSrc, src);
    496 
    497         // deswizzle
    498         dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
    499         dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
    500         dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
    501 
    502         // clamp
    503         dst.x = Clamp<DstFormat>(dst.x, 0);
    504         dst.y = Clamp<DstFormat>(dst.y, 1);
    505         dst.z = Clamp<DstFormat>(dst.z, 2);
    506 
    507         // normalize
    508         dst.x = Normalize<DstFormat>(dst.x, 0);
    509         dst.y = Normalize<DstFormat>(dst.y, 1);
    510         dst.z = Normalize<DstFormat>(dst.z, 2);
    511 
    512         // pack
    513         simdscalari packed = _simd_castps_si(dst.x);
    514         packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetConstBPC(0)));
    515         packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetConstBPC(0) +
    516                                                                               FormatTraits<DstFormat>::GetConstBPC(1)));
    517 
    518         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
    519         uint32_t *pPacked = (uint32_t*)&packed;
    520         uint16_t *pAosTile = (uint16_t*)&aosTile[0];
    521         for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
    522         {
    523             *pAosTile++ = *pPacked++;
    524         }
    525 
    526 #endif
    527         // Store data into destination
    528         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
    529     }
    530 };
    531 
    532 //////////////////////////////////////////////////////////////////////////
    533 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
    534 //////////////////////////////////////////////////////////////////////////
    535 template<>
    536 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
    537 {
    538     static const SWR_FORMAT SrcFormat = R32_FLOAT;
    539     static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
    540 
    541     //////////////////////////////////////////////////////////////////////////
    542     /// @brief Converts a SIMD from the Hot Tile to the destination format
    543     ///        and converts from SOA to AOS.
    544     /// @param pSrc - Pointer to raster tile.
    545     /// @param pDst - Pointer to destination surface or deswizzling buffer.
    546     template <size_t NumDests>
    547     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    548     {
    549 #if USE_8x2_TILE_BACKEND
    550         simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
    551 
    552         // clamp
    553         const simd16scalar zero = _simd16_setzero_ps();
    554         const simd16scalar ones = _simd16_set1_ps(1.0f);
    555 
    556         comp = _simd16_max_ps(comp, zero);
    557         comp = _simd16_min_ps(comp, ones);
    558 
    559         // normalize
    560         comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    561 
    562         simd16scalari temp = _simd16_cvtps_epi32(comp);
    563 
    564         // swizzle
    565         temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
    566 
    567         // merge/store data into destination but don't overwrite the X8 bits
    568         simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
    569         simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
    570 
    571         simd16scalari dest = _simd16_setzero_si();
    572 
    573         dest = _simd16_insert_si(dest, destlo, 0);
    574         dest = _simd16_insert_si(dest, desthi, 1);
    575 
    576         simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
    577 
    578         dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
    579 
    580         _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
    581         _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
    582 #else
    583         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
    584 
    585         OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
    586         OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
    587 
    588         // Convert from SrcFormat --> DstFormat
    589         simdvector src;
    590         LoadSOA<SrcFormat>(pSrc, src);
    591         StoreSOA<DstFormat>(src, soaTile);
    592 
    593         // Convert from SOA --> AOS
    594         FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
    595 
    596         // Store data into destination but don't overwrite the X8 bits
    597         // Each 4-pixel row is 16-bytes
    598         simd4scalari *pZRow01 = (simd4scalari*)aosTile;
    599         simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
    600         simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
    601 
    602         simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
    603         simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
    604 
    605         simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
    606         simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
    607 
    608         simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
    609 
    610         vDst0 = SIMD128::andnot_si(vMask, vDst0);
    611         vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
    612         vDst1 = SIMD128::andnot_si(vMask, vDst1);
    613         vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
    614 
    615         SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
    616         SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
    617 #endif
    618     }
    619 };
    620 
    621 #if USE_8x2_TILE_BACKEND
    622 template<SWR_FORMAT DstFormat>
    623 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
    624 {
    625     // swizzle rgba -> bgra while we load
    626     simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
    627     simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
    628     simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
    629     simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
    630 
    631     // clamp
    632     const simd16scalar zero = _simd16_setzero_ps();
    633     const simd16scalar ones = _simd16_set1_ps(1.0f);
    634 
    635     comp0 = _simd16_max_ps(comp0, zero);
    636     comp0 = _simd16_min_ps(comp0, ones);
    637 
    638     comp1 = _simd16_max_ps(comp1, zero);
    639     comp1 = _simd16_min_ps(comp1, ones);
    640 
    641     comp2 = _simd16_max_ps(comp2, zero);
    642     comp2 = _simd16_min_ps(comp2, ones);
    643 
    644     comp3 = _simd16_max_ps(comp3, zero);
    645     comp3 = _simd16_min_ps(comp3, ones);
    646 
    647     // gamma-correct only rgb
    648     if (FormatTraits<DstFormat>::isSRGB)
    649     {
    650         comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
    651         comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
    652         comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
    653     }
    654 
    655     // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
    656     comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    657     comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    658     comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    659     comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
    660 
    661     // moving to 16 wide integer vector types
    662     simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
    663     simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
    664     simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
    665     simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
    666 
    667     // SOA to AOS conversion
    668     src1 = _simd16_slli_epi32(src1,  8);
    669     src2 = _simd16_slli_epi32(src2, 16);
    670     src3 = _simd16_slli_epi32(src3, 24);
    671 
    672     simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    673 
    674     // de-swizzle conversion
    675 #if 1
    676     simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    677     simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
    678 
    679     final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
    680 
    681 #else
    682     final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
    683 
    684 #endif
    685     // store 8x2 memory order:
    686     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    687     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    688     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
    689     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
    690 }
    691 
    692 #endif
    693 template<SWR_FORMAT DstFormat>
    694 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
    695 {
    696     static const uint32_t offset = sizeof(simdscalar);
    697 
    698     // swizzle rgba -> bgra while we load
    699     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
    700     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
    701     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
    702     simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
    703 
    704     // clamp
    705     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
    706     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
    707 
    708     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
    709     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
    710 
    711     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
    712     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
    713 
    714     vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
    715     vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
    716 
    717     if (FormatTraits<DstFormat>::isSRGB)
    718     {
    719         // Gamma-correct only rgb
    720         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
    721         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
    722         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
    723     }
    724 
    725     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
    726     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    727     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    728     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    729     vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
    730 
    731     // moving to 8 wide integer vector types
    732     simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
    733     simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
    734     simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
    735     simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
    736 
    737 #if KNOB_ARCH <= KNOB_ARCH_AVX
    738 
    739     // splitting into two sets of 4 wide integer vector types
    740     // because AVX doesn't have instructions to support this operation at 8 wide
    741     simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
    742     simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
    743     simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
    744     simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
    745 
    746     simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
    747     simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
    748     simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
    749     simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
    750 
    751     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
    752     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
    753     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
    754     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
    755     srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
    756     srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
    757 
    758     srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
    759     srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
    760 
    761     srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
    762     srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
    763 
    764     srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
    765     srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
    766 
    767     // unpack into rows that get the tiling order correct
    768     simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
    769     simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
    770 
    771     simdscalari final = _mm256_castsi128_si256(vRow00);
    772     final = _mm256_insertf128_si256(final, vRow10, 1);
    773 
    774 #else
    775 
    776     // logic is as above, only wider
    777     src1 = _mm256_slli_si256(src1, 1);
    778     src2 = _mm256_slli_si256(src2, 2);
    779     src3 = _mm256_slli_si256(src3, 3);
    780 
    781     src0 = _mm256_or_si256(src0, src1);
    782     src2 = _mm256_or_si256(src2, src3);
    783 
    784     simdscalari final = _mm256_or_si256(src0, src2);
    785 
    786     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
    787     final = _mm256_permute4x64_epi64(final, 0xD8);
    788 #endif
    789 
    790     _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
    791 }
    792 
    793 #if USE_8x2_TILE_BACKEND
    794 template<SWR_FORMAT DstFormat>
    795 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
    796 {
    797     // swizzle rgba -> bgra while we load
    798     simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
    799     simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
    800     simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
    801 
    802     // clamp
    803     const simd16scalar zero = _simd16_setzero_ps();
    804     const simd16scalar ones = _simd16_set1_ps(1.0f);
    805 
    806     comp0 = _simd16_max_ps(comp0, zero);
    807     comp0 = _simd16_min_ps(comp0, ones);
    808 
    809     comp1 = _simd16_max_ps(comp1, zero);
    810     comp1 = _simd16_min_ps(comp1, ones);
    811 
    812     comp2 = _simd16_max_ps(comp2, zero);
    813     comp2 = _simd16_min_ps(comp2, ones);
    814 
    815     // gamma-correct only rgb
    816     if (FormatTraits<DstFormat>::isSRGB)
    817     {
    818         comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
    819         comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
    820         comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
    821     }
    822 
    823     // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
    824     comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    825     comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    826     comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    827 
    828     // moving to 16 wide integer vector types
    829     simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
    830     simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
    831     simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
    832 
    833     // SOA to AOS conversion
    834     src1 = _simd16_slli_epi32(src1,  8);
    835     src2 = _simd16_slli_epi32(src2, 16);
    836 
    837     simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    838 
    839     // de-swizzle conversion
    840 #if 1
    841     simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    842     simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
    843 
    844     final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
    845 
    846 #else
    847     final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
    848 
    849 #endif
    850     // store 8x2 memory order:
    851     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    852     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    853     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
    854     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
    855 }
    856 
    857 #endif
    858 template<SWR_FORMAT DstFormat>
    859 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
    860 {
    861     static const uint32_t offset = sizeof(simdscalar);
    862 
    863     // swizzle rgba -> bgra while we load
    864     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
    865     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
    866     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
    867                                                                                                             // clamp
    868     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
    869     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
    870 
    871     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
    872     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
    873 
    874     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
    875     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
    876 
    877     if (FormatTraits<DstFormat>::isSRGB)
    878     {
    879         // Gamma-correct only rgb
    880         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
    881         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
    882         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
    883     }
    884 
    885     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
    886     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    887     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    888     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
    889 
    890     // moving to 8 wide integer vector types
    891     simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
    892     simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
    893     simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
    894 
    895 #if KNOB_ARCH <= KNOB_ARCH_AVX
    896 
    897     // splitting into two sets of 4 wide integer vector types
    898     // because AVX doesn't have instructions to support this operation at 8 wide
    899     simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
    900     simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
    901     simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
    902 
    903     simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
    904     simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
    905     simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
    906 
    907     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
    908     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
    909     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
    910     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
    911 
    912     srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
    913 
    914     srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
    915 
    916     srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
    917     srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
    918 
    919     // unpack into rows that get the tiling order correct
    920     simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
    921     simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
    922 
    923     simdscalari final = _mm256_castsi128_si256(vRow00);
    924     final = _mm256_insertf128_si256(final, vRow10, 1);
    925 
    926 #else
    927 
    928                                               // logic is as above, only wider
    929     src1 = _mm256_slli_si256(src1, 1);
    930     src2 = _mm256_slli_si256(src2, 2);
    931 
    932     src0 = _mm256_or_si256(src0, src1);
    933 
    934     simdscalari final = _mm256_or_si256(src0, src2);
    935 
    936     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
    937     final = _mm256_permute4x64_epi64(final, 0xD8);
    938 
    939 #endif
    940 
    941     _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
    942 }
    943 
    944 template<>
    945 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
    946 {
    947     template <size_t NumDests>
    948     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    949     {
    950 #if USE_8x2_TILE_BACKEND
    951         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    952 #else
    953         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
    954 #endif
    955     }
    956 };
    957 
    958 template<>
    959 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
    960 {
    961     template <size_t NumDests>
    962     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    963     {
    964 #if USE_8x2_TILE_BACKEND
    965         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    966 #else
    967         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
    968 #endif
    969     }
    970 };
    971 
    972 template<>
    973 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
    974 {
    975     template <size_t NumDests>
    976     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    977     {
    978 #if USE_8x2_TILE_BACKEND
    979         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    980 #else
    981         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
    982 #endif
    983     }
    984 };
    985 
    986 template<>
    987 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
    988 {
    989     template <size_t NumDests>
    990     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
    991     {
    992 #if USE_8x2_TILE_BACKEND
    993         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
    994 #else
    995         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
    996 #endif
    997     }
    998 };
    999 
   1000 template<>
   1001 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
   1002 {
   1003     template <size_t NumDests>
   1004     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1005     {
   1006 #if USE_8x2_TILE_BACKEND
   1007         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1008 #else
   1009         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
   1010 #endif
   1011     }
   1012 };
   1013 
   1014 template<>
   1015 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
   1016 {
   1017     template <size_t NumDests>
   1018     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1019     {
   1020 #if USE_8x2_TILE_BACKEND
   1021         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1022 #else
   1023         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
   1024 #endif
   1025     }
   1026 };
   1027 
   1028 template<>
   1029 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
   1030 {
   1031     template <size_t NumDests>
   1032     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1033     {
   1034 #if USE_8x2_TILE_BACKEND
   1035         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1036 #else
   1037         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
   1038 #endif
   1039     }
   1040 };
   1041 
   1042 template<>
   1043 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
   1044 {
   1045     template <size_t NumDests>
   1046     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
   1047     {
   1048 #if USE_8x2_TILE_BACKEND
   1049         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
   1050 #else
   1051         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
   1052 #endif
   1053     }
   1054 };
   1055 
   1056 //////////////////////////////////////////////////////////////////////////
   1057 /// StoreRasterTile
   1058 //////////////////////////////////////////////////////////////////////////
   1059 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1060 struct StoreRasterTile
   1061 {
   1062     //////////////////////////////////////////////////////////////////////////
   1063     /// @brief Retrieve color from hot tile source which is always float.
   1064     /// @param pSrc - Pointer to raster tile.
   1065     /// @param x, y - Coordinates to raster tile.
   1066     /// @param output - output color
   1067     INLINE static void GetSwizzledSrcColor(
   1068         uint8_t* pSrc,
   1069         uint32_t x, uint32_t y,
   1070         float outputColor[4])
   1071     {
   1072 #if USE_8x2_TILE_BACKEND
   1073         typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
   1074 
   1075         SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
   1076 
   1077         // Compute which simd tile we're accessing within 8x8 tile.
   1078         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
   1079         uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
   1080 
   1081         SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
   1082 
   1083         uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
   1084 
   1085         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
   1086 #else
   1087         typedef SimdTile<SrcFormat, DstFormat> SimdT;
   1088 
   1089         SimdT* pSrcSimdTiles = (SimdT*)pSrc;
   1090 
   1091         // Compute which simd tile we're accessing within 8x8 tile.
   1092         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
   1093         uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
   1094 
   1095         SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
   1096 
   1097         uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
   1098 
   1099         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
   1100 #endif
   1101     }
   1102 
   1103     //////////////////////////////////////////////////////////////////////////
   1104     /// @brief Stores an 8x8 raster tile to the destination surface.
   1105     /// @param pSrc - Pointer to raster tile.
   1106     /// @param pDstSurface - Destination surface state
   1107     /// @param x, y - Coordinates to raster tile.
   1108     INLINE static void Store(
   1109         uint8_t *pSrc,
   1110         SWR_SURFACE_STATE* pDstSurface,
   1111         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
   1112     {
   1113         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1114         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1115 
   1116         // For each raster tile pixel (rx, ry)
   1117         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
   1118         {
   1119             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
   1120             {
   1121                 // Perform bounds checking.
   1122                 if (((x + rx) < lodWidth) &&
   1123                     ((y + ry) < lodHeight))
   1124                 {
   1125                     float srcColor[4];
   1126                     GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
   1127 
   1128                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
   1129                         pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1130                         sampleNum, pDstSurface->lod, pDstSurface);
   1131                     {
   1132                         ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
   1133                     }
   1134                 }
   1135             }
   1136         }
   1137     }
   1138 
   1139     //////////////////////////////////////////////////////////////////////////
   1140     /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
   1141     /// @param pSrc - Pointer to raster tile.
   1142     /// @param pDstSurface - Destination surface state
   1143     /// @param x, y - Coordinates to raster tile.
   1144     /// @param sampleOffset - Offset between adjacent multisamples
   1145     INLINE static void Resolve(
   1146         uint8_t *pSrc,
   1147         SWR_SURFACE_STATE* pDstSurface,
   1148         uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
   1149     {
   1150         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1151         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1152 
   1153         float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
   1154 
   1155         // For each raster tile pixel (rx, ry)
   1156         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
   1157         {
   1158             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
   1159             {
   1160                 // Perform bounds checking.
   1161                 if (((x + rx) < lodWidth) &&
   1162                         ((y + ry) < lodHeight))
   1163                 {
   1164                     // Sum across samples
   1165                     float resolveColor[4] = {0};
   1166                     for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
   1167                     {
   1168                         float sampleColor[4] = {0};
   1169                         uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
   1170                         GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
   1171                         resolveColor[0] += sampleColor[0];
   1172                         resolveColor[1] += sampleColor[1];
   1173                         resolveColor[2] += sampleColor[2];
   1174                         resolveColor[3] += sampleColor[3];
   1175                     }
   1176 
   1177                     // Divide by numSamples to average
   1178                     resolveColor[0] *= oneOverNumSamples;
   1179                     resolveColor[1] *= oneOverNumSamples;
   1180                     resolveColor[2] *= oneOverNumSamples;
   1181                     resolveColor[3] *= oneOverNumSamples;
   1182 
   1183                     // Use the resolve surface state
   1184                     SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
   1185                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
   1186                         pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
   1187                         0, pResolveSurface->lod, pResolveSurface);
   1188                     {
   1189                         ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
   1190                     }
   1191                 }
   1192             }
   1193         }
   1194     }
   1195 
   1196 };
   1197 
   1198 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1199 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
   1200 {};
   1201 
   1202 //////////////////////////////////////////////////////////////////////////
   1203 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
   1204 //////////////////////////////////////////////////////////////////////////
   1205 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1206 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
   1207 {
   1208     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
   1209     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1210     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1211 
   1212     //////////////////////////////////////////////////////////////////////////
   1213     /// @brief Stores an 8x8 raster tile to the destination surface.
   1214     /// @param pSrc - Pointer to raster tile.
   1215     /// @param pDstSurface - Destination surface state
   1216     /// @param x, y - Coordinates to raster tile.
   1217     INLINE static void Store(
   1218         uint8_t *pSrc,
   1219         SWR_SURFACE_STATE* pDstSurface,
   1220         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1221     {
   1222         // Punt non-full tiles to generic store
   1223         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1224         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1225 
   1226         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1227         {
   1228             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1229         }
   1230 
   1231         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1232             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1233 #if USE_8x2_TILE_BACKEND
   1234 
   1235         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1236         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1237 
   1238         uint8_t* ppDsts[] =
   1239         {
   1240             pDst,                                           // row 0, col 0
   1241             pDst + pDstSurface->pitch,                      // row 1, col 0
   1242             pDst + dx / 2,                                  // row 0, col 1
   1243             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
   1244         };
   1245 
   1246         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1247         {
   1248             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
   1249             {
   1250                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1251 
   1252                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1253 
   1254                 ppDsts[0] += dx;
   1255                 ppDsts[1] += dx;
   1256                 ppDsts[2] += dx;
   1257                 ppDsts[3] += dx;
   1258             }
   1259 
   1260             ppDsts[0] += dy;
   1261             ppDsts[1] += dy;
   1262             ppDsts[2] += dy;
   1263             ppDsts[3] += dy;
   1264         }
   1265 #else
   1266         uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
   1267 
   1268         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   1269         {
   1270             uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
   1271 
   1272             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   1273             {
   1274                 // Format conversion and convert from SOA to AOS, and store the rows.
   1275                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
   1276 
   1277                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1278                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1279                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
   1280             }
   1281 
   1282             ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
   1283             ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
   1284         }
   1285 #endif
   1286     }
   1287 };
   1288 
   1289 //////////////////////////////////////////////////////////////////////////
   1290 /// OptStoreRasterTile - SWR_TILE_NONE specialization for 16bpp
   1291 //////////////////////////////////////////////////////////////////////////
   1292 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1293 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
   1294 {
   1295     using GenericStoreTile = StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>;
   1296     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1297     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1298 
   1299     //////////////////////////////////////////////////////////////////////////
   1300     /// @brief Converts a raster tile and writes it to the destination surface.
   1301     ///        Tiles that do not fit inside the LOD are handed to GenericStoreTile.
   1302     /// @param pSrc - Pointer to raster tile.
   1303     /// @param pDstSurface - Destination surface state
   1304     /// @param x, y - Coordinates to raster tile.
   1305     /// @param sampleNum, renderTargetArrayIndex - Used to compute the destination address.
   1306     INLINE static void Store(
   1307         uint8_t *pSrc,
   1308         SWR_SURFACE_STATE* pDstSurface,
   1309         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1310     {
   1311         // The fast path only handles complete tiles; everything else is punted.
   1312         const uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1313         const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1314         const bool fitsInLod = (x + KNOB_TILE_X_DIM <= lodWidth) && (y + KNOB_TILE_Y_DIM <= lodHeight);
   1315         if (!fitsInLod)
   1316         {
   1317             GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1318             return;
   1319         }
   1320 
   1321         const auto dstArrayIndex = pDstSurface->arrayIndex + renderTargetArrayIndex;
   1322         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, dstArrayIndex,
   1323             dstArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1324 #if USE_8x2_TILE_BACKEND
   1325         // spanBytes: destination bytes in one row of a simd16 tile.
   1326         // nextBand:  SIMD16_TILE_Y_DIM rows down, minus the bytes of one raster tile row.
   1327         const uint32_t spanBytes = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1328         const uint32_t nextBand = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1329 
   1330         // Entries: (row 0, col 0), (row 1, col 0), (row 0, col 1), (row 1, col 1).
   1331         uint8_t* const pRow1 = pDst + pDstSurface->pitch;
   1332         uint8_t* ppDsts[] = { pDst, pRow1, pDst + spanBytes / 2, pRow1 + spanBytes / 2 };
   1333 
   1334         for (uint32_t tileY = 0; tileY < KNOB_TILE_Y_DIM; tileY += SIMD16_TILE_Y_DIM)
   1335         {
   1336             for (uint32_t tileX = 0; tileX < KNOB_TILE_X_DIM; tileX += SIMD16_TILE_X_DIM)
   1337             {
   1338                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1339                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1340 
   1341                 for (uint8_t*& pSpan : ppDsts)
   1342                 {
   1343                     pSpan += spanBytes;
   1344                 }
   1345             }
   1346 
   1347             for (uint8_t*& pSpan : ppDsts)
   1348             {
   1349                 pSpan += nextBand;
   1350             }
   1351         }
   1352 #else
   1353         // Walk the tile in bands of two destination rows.
   1354         uint8_t* pBand = pDst;
   1355 
   1356         for (uint32_t band = 0; band < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++band)
   1357         {
   1358             uint8_t* ppRows[] = { pBand, pBand + pDstSurface->pitch };
   1359 
   1360             for (uint32_t simd = 0; simd < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++simd)
   1361             {
   1362                 // Format conversion plus SOA to AOS, stored directly into both rows.
   1363                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
   1364                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
   1365 
   1366                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1367                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1368             }
   1369 
   1370             pBand += 2 * pDstSurface->pitch;
   1371         }
   1372 #endif
   1373     }
   1374 };
   1375 
   1376 //////////////////////////////////////////////////////////////////////////
   1377 /// OptStoreRasterTile - SWR_TILE_NONE specialization for 32bpp
   1378 //////////////////////////////////////////////////////////////////////////
   1379 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1380 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
   1381 {
   1382     using GenericStoreTile = StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>;
   1383     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1384     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1385 
   1386     //////////////////////////////////////////////////////////////////////////
   1387     /// @brief Converts a raster tile and writes it to the destination surface.
   1388     ///        Tiles that do not fit inside the LOD are handed to GenericStoreTile.
   1389     /// @param pSrc - Pointer to raster tile.
   1390     /// @param pDstSurface - Destination surface state
   1391     /// @param x, y - Coordinates to raster tile.
   1392     /// @param sampleNum, renderTargetArrayIndex - Used to compute the destination address.
   1393     INLINE static void Store(
   1394         uint8_t *pSrc,
   1395         SWR_SURFACE_STATE* pDstSurface,
   1396         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1397     {
   1398         // The fast path only handles complete tiles; everything else is punted.
   1399         const uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1400         const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1401         const bool fitsInLod = (x + KNOB_TILE_X_DIM <= lodWidth) && (y + KNOB_TILE_Y_DIM <= lodHeight);
   1402         if (!fitsInLod)
   1403         {
   1404             GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1405             return;
   1406         }
   1407 
   1408         const auto dstArrayIndex = pDstSurface->arrayIndex + renderTargetArrayIndex;
   1409         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, dstArrayIndex,
   1410             dstArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1411 #if USE_8x2_TILE_BACKEND
   1412         // spanBytes: destination bytes in one row of a simd16 tile.
   1413         // nextBand:  SIMD16_TILE_Y_DIM rows down, minus the bytes of one raster tile row.
   1414         const uint32_t spanBytes = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1415         const uint32_t nextBand = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1416 
   1417         // Entries: (row 0, col 0), (row 1, col 0), (row 0, col 1), (row 1, col 1).
   1418         uint8_t* const pRow1 = pDst + pDstSurface->pitch;
   1419         uint8_t* ppDsts[] = { pDst, pRow1, pDst + spanBytes / 2, pRow1 + spanBytes / 2 };
   1420 
   1421         for (uint32_t tileY = 0; tileY < KNOB_TILE_Y_DIM; tileY += SIMD16_TILE_Y_DIM)
   1422         {
   1423             for (uint32_t tileX = 0; tileX < KNOB_TILE_X_DIM; tileX += SIMD16_TILE_X_DIM)
   1424             {
   1425                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1426                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1427 
   1428                 for (uint8_t*& pSpan : ppDsts)
   1429                 {
   1430                     pSpan += spanBytes;
   1431                 }
   1432             }
   1433 
   1434             for (uint8_t*& pSpan : ppDsts)
   1435             {
   1436                 pSpan += nextBand;
   1437             }
   1438         }
   1439 #else
   1440         // Walk the tile in bands of two destination rows.
   1441         uint8_t* pBand = pDst;
   1442 
   1443         for (uint32_t band = 0; band < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++band)
   1444         {
   1445             uint8_t* ppRows[] = { pBand, pBand + pDstSurface->pitch };
   1446 
   1447             for (uint32_t simd = 0; simd < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++simd)
   1448             {
   1449                 // Format conversion plus SOA to AOS, stored directly into both rows.
   1450                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
   1451                 pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
   1452 
   1453                 ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1454                 ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1455             }
   1456 
   1457             pBand += 2 * pDstSurface->pitch;
   1458         }
   1459 #endif
   1460     }
   1461 };
   1462 
   1463 //////////////////////////////////////////////////////////////////////////
   1464 /// OptStoreRasterTile - SWR_TILE_NONE specialization for 64bpp
   1465 //////////////////////////////////////////////////////////////////////////
   1466 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1467 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
   1468 {
   1469     using GenericStoreTile = StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>;
   1470 
   1471     // Bytes per pixel of the source (raster tile) and destination formats.
   1472     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1473     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1474     // Byte distance between neighbouring destination column pointers.
   1475     static const size_t MAX_DST_COLUMN_BYTES = 16;
   1476 #if !USE_8x2_TILE_BACKEND
   1477     // Per Convert call: source bytes consumed, and bytes each destination pointer advances.
   1478     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
   1479     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1480 #endif
   1481 
   1482     //////////////////////////////////////////////////////////////////////////
   1483     /// @brief Converts a raster tile and writes it to the destination surface.
   1484     ///        Tiles that do not fit inside the LOD are handed to GenericStoreTile.
   1485     /// @param pSrc - Pointer to raster tile.
   1486     /// @param pDstSurface - Destination surface state
   1487     /// @param x, y - Coordinates to raster tile.
   1488     /// @param sampleNum - Sample index used to compute the destination address.
   1489     /// @param renderTargetArrayIndex - Added to the surface's arrayIndex for addressing.
   1490     INLINE static void Store(
   1491         uint8_t *pSrc,
   1492         SWR_SURFACE_STATE* pDstSurface,
   1493         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1494     {
   1495         // The fast path only handles complete tiles; everything else is punted.
   1496         const uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1497         const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1498         const bool fitsInLod = (x + KNOB_TILE_X_DIM <= lodWidth) && (y + KNOB_TILE_Y_DIM <= lodHeight);
   1499 
   1500         if (!fitsInLod)
   1501         {
   1502             GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1503             return;
   1504         }
   1505 
   1506         const auto dstArrayIndex = pDstSurface->arrayIndex + renderTargetArrayIndex;
   1507         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, dstArrayIndex,
   1508             dstArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1509 #if USE_8x2_TILE_BACKEND
   1510         // Raster tile width is same as simd16 tile width
   1511         static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1512 
   1513         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on
   1514         // max 16B spans (a TileY limitation): one simd16 tile row becomes 4 columns.
   1515         const uint32_t rowBytes = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1516         static_assert(rowBytes == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
   1517 
   1518         // Every pointer moves SIMD16_TILE_Y_DIM rows down after each simd16 tile.
   1519         const uint32_t nextBand = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
   1520 
   1521         uint8_t* const pRow1 = pDst + pDstSurface->pitch;
   1522         uint8_t* ppDsts[] =
   1523         {
   1524             pDst,                                   // row 0, col 0
   1525             pRow1,                                  // row 1, col 0
   1526             pDst  + MAX_DST_COLUMN_BYTES,           // row 0, col 1
   1527             pRow1 + MAX_DST_COLUMN_BYTES,           // row 1, col 1
   1528             pDst  + MAX_DST_COLUMN_BYTES * 2,       // row 0, col 2
   1529             pRow1 + MAX_DST_COLUMN_BYTES * 2,       // row 1, col 2
   1530             pDst  + MAX_DST_COLUMN_BYTES * 3,       // row 0, col 3
   1531             pRow1 + MAX_DST_COLUMN_BYTES * 3        // row 1, col 3
   1532         };
   1533 
   1534         for (uint32_t tileY = 0; tileY < KNOB_TILE_Y_DIM; tileY += SIMD16_TILE_Y_DIM)
   1535         {
   1536             // One Convert call covers the full tile width (see static_assert above).
   1537             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1538             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1539 
   1540             for (uint8_t*& pSpan : ppDsts)
   1541             {
   1542                 pSpan += nextBand;
   1543             }
   1544         }
   1545 #else
   1546         // Walk the tile in bands of two destination rows.
   1547         uint8_t* pBand = pDst;
   1548 
   1549         for (uint32_t band = 0; band < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++band)
   1550         {
   1551             // Entries: (row 0, col 0), (row 1, col 0), (row 0, col 1), (row 1, col 1).
   1552             uint8_t* const pRow1 = pBand + pDstSurface->pitch;
   1553             uint8_t* ppDsts[] = { pBand, pRow1, pBand + MAX_DST_COLUMN_BYTES, pRow1 + MAX_DST_COLUMN_BYTES };
   1554 
   1555             for (uint32_t simd = 0; simd < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++simd)
   1556             {
   1557                 // Format conversion plus SOA to AOS, stored directly into both rows.
   1558                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1559                 pSrc += SRC_COLUMN_BYTES;
   1560 
   1561                 for (uint8_t*& pSpan : ppDsts)
   1562                 {
   1563                     pSpan += DST_COLUMN_BYTES_PER_SRC;
   1564                 }
   1565             }
   1566 
   1567             pBand += 2 * pDstSurface->pitch;
   1568         }
   1569 #endif
   1570     }
   1571 };
   1572 
   1573 //////////////////////////////////////////////////////////////////////////
   1574 /// OptStoreRasterTile - SWR_TILE_NONE specialization for 128bpp
   1575 //////////////////////////////////////////////////////////////////////////
   1576 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1577 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
   1578 {
   1579     using GenericStoreTile = StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>;
   1580 
   1581     // Bytes per pixel of the source (raster tile) and destination formats.
   1582     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1583     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1584     // Byte distance between neighbouring destination column pointers.
   1585     static const size_t MAX_DST_COLUMN_BYTES = 16;
   1586 #if !USE_8x2_TILE_BACKEND
   1587     // Per Convert call: source bytes consumed, and bytes each destination pointer advances.
   1588     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
   1589     static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
   1590 #endif
   1591 
   1592     //////////////////////////////////////////////////////////////////////////
   1593     /// @brief Converts a raster tile and writes it to the destination surface.
   1594     ///        Tiles that do not fit inside the LOD are handed to GenericStoreTile.
   1595     /// @param pSrc - Pointer to raster tile.
   1596     /// @param pDstSurface - Destination surface state
   1597     /// @param x, y - Coordinates to raster tile.
   1598     /// @param sampleNum - Sample index used to compute the destination address.
   1599     /// @param renderTargetArrayIndex - Added to the surface's arrayIndex for addressing.
   1600     INLINE static void Store(
   1601         uint8_t *pSrc,
   1602         SWR_SURFACE_STATE* pDstSurface,
   1603         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1604     {
   1605         // The fast path only handles complete tiles; everything else is punted.
   1606         const uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1607         const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1608         const bool fitsInLod = (x + KNOB_TILE_X_DIM <= lodWidth) && (y + KNOB_TILE_Y_DIM <= lodHeight);
   1609 
   1610         if (!fitsInLod)
   1611         {
   1612             GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1613             return;
   1614         }
   1615 
   1616         const auto dstArrayIndex = pDstSurface->arrayIndex + renderTargetArrayIndex;
   1617         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, dstArrayIndex,
   1618             dstArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1619 #if USE_8x2_TILE_BACKEND
   1620         // Raster tile width is same as simd16 tile width
   1621         static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1622 
   1623         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on
   1624         // max 16B spans (a TileY limitation): one simd16 tile row becomes 8 columns.
   1625         const uint32_t rowBytes = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1626         static_assert(rowBytes == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
   1627 
   1628         // Every pointer moves SIMD16_TILE_Y_DIM rows down after each simd16 tile.
   1629         const uint32_t nextBand = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
   1630 
   1631         uint8_t* const pRow1 = pDst + pDstSurface->pitch;
   1632         uint8_t* ppDsts[] =
   1633         {
   1634             pDst,                                   // row 0, col 0
   1635             pRow1,                                  // row 1, col 0
   1636             pDst  + MAX_DST_COLUMN_BYTES,           // row 0, col 1
   1637             pRow1 + MAX_DST_COLUMN_BYTES,           // row 1, col 1
   1638             pDst  + MAX_DST_COLUMN_BYTES * 2,       // row 0, col 2
   1639             pRow1 + MAX_DST_COLUMN_BYTES * 2,       // row 1, col 2
   1640             pDst  + MAX_DST_COLUMN_BYTES * 3,       // row 0, col 3
   1641             pRow1 + MAX_DST_COLUMN_BYTES * 3,       // row 1, col 3
   1642             pDst  + MAX_DST_COLUMN_BYTES * 4,       // row 0, col 4
   1643             pRow1 + MAX_DST_COLUMN_BYTES * 4,       // row 1, col 4
   1644             pDst  + MAX_DST_COLUMN_BYTES * 5,       // row 0, col 5
   1645             pRow1 + MAX_DST_COLUMN_BYTES * 5,       // row 1, col 5
   1646             pDst  + MAX_DST_COLUMN_BYTES * 6,       // row 0, col 6
   1647             pRow1 + MAX_DST_COLUMN_BYTES * 6,       // row 1, col 6
   1648             pDst  + MAX_DST_COLUMN_BYTES * 7,       // row 0, col 7
   1649             pRow1 + MAX_DST_COLUMN_BYTES * 7,       // row 1, col 7
   1650         };
   1651 
   1652         for (uint32_t tileY = 0; tileY < KNOB_TILE_Y_DIM; tileY += SIMD16_TILE_Y_DIM)
   1653         {
   1654             // One Convert call covers the full tile width (see static_assert above).
   1655             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1656             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1657 
   1658             for (uint8_t*& pSpan : ppDsts)
   1659             {
   1660                 pSpan += nextBand;
   1661             }
   1662         }
   1663 #else
   1664         // Walk the tile in bands of two destination rows.
   1665         uint8_t* pBand = pDst;
   1666 
   1667         for (uint32_t band = 0; band < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++band)
   1668         {
   1669             // Need 8 pointers, 4 columns of 2 rows each.
   1670             // Slot [col * 2 + row] holds the address of column 'col' in row 'row' of the band.
   1671             uint8_t* const pRow1 = pBand + pDstSurface->pitch;
   1672             uint8_t* ppDsts[8];
   1673 
   1674             for (uint32_t col = 0; col < 4; ++col)
   1675             {
   1676                 ppDsts[col * 2 + 0] = pBand + col * MAX_DST_COLUMN_BYTES;
   1677                 ppDsts[col * 2 + 1] = pRow1 + col * MAX_DST_COLUMN_BYTES;
   1678             }
   1679 
   1680             for (uint32_t simd = 0; simd < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++simd)
   1681             {
   1682                 // Format conversion plus SOA to AOS, stored directly into both rows.
   1683                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1684                 pSrc += SRC_COLUMN_BYTES;
   1685 
   1686                 for (uint8_t*& pSpan : ppDsts)
   1687                 {
   1688                     pSpan += DST_COLUMN_BYTES_PER_SRC;
   1689                 }
   1690             }
   1691 
   1692             // The next band starts two destination rows further down.
   1693             pBand += 2 * pDstSurface->pitch;
   1694         }
   1695 #endif
   1696     }
   1697 };
   1698 
   1699 //////////////////////////////////////////////////////////////////////////
   1700 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 8bpp
   1701 //////////////////////////////////////////////////////////////////////////
   1702 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1703 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
   1704 {
   1705     using GenericStoreTile = StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>;
   1706     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1707 
   1708     //////////////////////////////////////////////////////////////////////////
   1709     /// @brief Converts a raster tile and writes it to the destination surface.
   1710     ///        Tiles that do not fit inside the LOD are handed to GenericStoreTile.
   1711     /// @param pSrc - Pointer to raster tile.
   1712     /// @param pDstSurface - Destination surface state
   1713     /// @param x, y - Coordinates to raster tile.
   1714     /// @param sampleNum - Sample index used to compute the destination address.
   1715     /// @param renderTargetArrayIndex - Added to the surface's arrayIndex for addressing.
   1716     INLINE static void Store(
   1717         uint8_t *pSrc,
   1718         SWR_SURFACE_STATE* pDstSurface,
   1719         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1720     {
   1721         static const uint32_t DestRowWidthBytes = 16;    // one TileY row is 16B
   1722 
   1723         // The fast path only handles complete tiles; everything else is punted.
   1724         const uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1725         const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1726         const bool fitsInLod = (x + KNOB_TILE_X_DIM <= lodWidth) && (y + KNOB_TILE_Y_DIM <= lodHeight);
   1727 
   1728         if (!fitsInLod)
   1729         {
   1730             GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1731             return;
   1732         }
   1733 
   1734         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   1735         // The raster tile's address is computed once; every destination pointer below is an offset from it.
   1736         const auto dstArrayIndex = pDstSurface->arrayIndex + renderTargetArrayIndex;
   1737         uint8_t* const pTile = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, dstArrayIndex,
   1738             dstArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1739 
   1740         // Byte offset from the first to the second span within a destination row.
   1741         const uint32_t secondSpanOffset = DestRowWidthBytes / 4;
   1742 
   1743 #if USE_8x2_TILE_BACKEND
   1744         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   1745         // Raster tile width is same as simd16 tile width
   1746         static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1747 
   1748         const uint32_t nextSimdTile = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   1749 
   1750         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1751         uint8_t* const pRow1 = pTile + DestRowWidthBytes;
   1752         uint8_t* ppDsts[] = { pTile, pRow1, pTile + secondSpanOffset, pRow1 + secondSpanOffset };
   1753 
   1754         for (uint32_t tileY = 0; tileY < KNOB_TILE_Y_DIM; tileY += SIMD16_TILE_Y_DIM)
   1755         {
   1756             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1757             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1758 
   1759             for (uint8_t*& pSpan : ppDsts)
   1760             {
   1761                 pSpan += nextSimdTile;
   1762             }
   1763         }
   1764 #else
   1765         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   1766         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   1767         const uint32_t srcSimdBytes = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   1768 
   1769         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1770         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   1771         {
   1772             uint8_t* const pRow = pTile + row * DestRowWidthBytes;
   1773             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
   1774 
   1775             // First span of the two rows.
   1776             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1777             pSrc += srcSimdBytes;
   1778 
   1779             // Second span of the same two rows.
   1780             ppDsts[0] += secondSpanOffset;
   1781             ppDsts[1] += secondSpanOffset;
   1782 
   1783             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1784             pSrc += srcSimdBytes;
   1785         }
   1786 #endif
   1787     }
   1788 };
   1789 
   1790 //////////////////////////////////////////////////////////////////////////
   1791 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 16bpp
   1792 //////////////////////////////////////////////////////////////////////////
   1793 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1794 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
   1795 {
   1796     using GenericStoreTile = StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>;
   1797     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1798 
   1799     //////////////////////////////////////////////////////////////////////////
   1800     /// @brief Converts a raster tile and writes it to the destination surface.
   1801     ///        Tiles that do not fit inside the LOD are handed to GenericStoreTile.
   1802     /// @param pSrc - Pointer to raster tile.
   1803     /// @param pDstSurface - Destination surface state
   1804     /// @param x, y - Coordinates to raster tile.
   1805     /// @param sampleNum - Sample index used to compute the destination address.
   1806     /// @param renderTargetArrayIndex - Added to the surface's arrayIndex for addressing.
   1807     INLINE static void Store(
   1808         uint8_t *pSrc,
   1809         SWR_SURFACE_STATE* pDstSurface,
   1810         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1811     {
   1812         static const uint32_t DestRowWidthBytes = 16;    // one TileY row is 16B
   1813 
   1814         // The fast path only handles complete tiles; everything else is punted.
   1815         const uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1816         const uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1817         const bool fitsInLod = (x + KNOB_TILE_X_DIM <= lodWidth) && (y + KNOB_TILE_Y_DIM <= lodHeight);
   1818 
   1819         if (!fitsInLod)
   1820         {
   1821             GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1822             return;
   1823         }
   1824 
   1825         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   1826         // The raster tile's address is computed once; every destination pointer below is an offset from it.
   1827         const auto dstArrayIndex = pDstSurface->arrayIndex + renderTargetArrayIndex;
   1828         uint8_t* const pTile = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, dstArrayIndex,
   1829             dstArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1830 
   1831         // Byte offset from the first to the second span within a destination row.
   1832         const uint32_t secondSpanOffset = DestRowWidthBytes / 2;
   1833 
   1834 #if USE_8x2_TILE_BACKEND
   1835         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   1836         // Raster tile width is same as simd16 tile width
   1837         static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   1838 
   1839         const uint32_t nextSimdTile = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   1840 
   1841         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1842         uint8_t* const pRow1 = pTile + DestRowWidthBytes;
   1843         uint8_t* ppDsts[] = { pTile, pRow1, pTile + secondSpanOffset, pRow1 + secondSpanOffset };
   1844 
   1845         for (uint32_t tileY = 0; tileY < KNOB_TILE_Y_DIM; tileY += SIMD16_TILE_Y_DIM)
   1846         {
   1847             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1848             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1849 
   1850             for (uint8_t*& pSpan : ppDsts)
   1851             {
   1852                 pSpan += nextSimdTile;
   1853             }
   1854         }
   1855 #else
   1856         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   1857         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   1858         const uint32_t srcSimdBytes = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   1859 
   1860         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   1861         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   1862         {
   1863             uint8_t* const pRow = pTile + row * DestRowWidthBytes;
   1864             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
   1865 
   1866             // First span of the two rows.
   1867             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1868             pSrc += srcSimdBytes;
   1869 
   1870             // Second span of the same two rows.
   1871             ppDsts[0] += secondSpanOffset;
   1872             ppDsts[1] += secondSpanOffset;
   1873 
   1874             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1875             pSrc += srcSimdBytes;
   1876         }
   1877 #endif
   1878     }
   1879 };
   1880 
   1881 //////////////////////////////////////////////////////////////////////////
   1882 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
   1883 //////////////////////////////////////////////////////////////////////////
   1884 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1885 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
   1886 {
   1887     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
   1888     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1889     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   1890 
   1891     //////////////////////////////////////////////////////////////////////////
   1892     /// @brief Stores an 8x8 raster tile to the destination surface.
   1893     /// @param pSrc - Pointer to raster tile.
   1894     /// @param pDstSurface - Destination surface state
   1895     /// @param x, y - Coordinates to raster tile.
   1896     INLINE static void Store(
   1897         uint8_t *pSrc,
   1898         SWR_SURFACE_STATE* pDstSurface,
   1899         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1900     {
   1901         static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
   1902 
   1903         // Punt non-full tiles to generic store
   1904         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1905         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1906 
   1907         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   1908         {
   1909             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   1910         }
   1911 
   1912         // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
   1913         // We can compute the offsets to each column within the raster tile once and increment from these.
   1914 #if USE_8x2_TILE_BACKEND
   1915         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1916             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1917 
   1918         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1919         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
   1920 
   1921         uint8_t* ppDsts[] =
   1922         {
   1923             pDst,                                           // row 0, col 0
   1924             pDst + DestRowWidthBytes,                       // row 1, col 0
   1925             pDst + dx / 2,                                  // row 0, col 1
   1926             pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
   1927         };
   1928 
   1929         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   1930         {
   1931             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
   1932             {
   1933                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1934 
   1935                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   1936 
   1937                 ppDsts[0] += dx;
   1938                 ppDsts[1] += dx;
   1939                 ppDsts[2] += dx;
   1940                 ppDsts[3] += dx;
   1941             }
   1942 
   1943             ppDsts[0] += dy;
   1944             ppDsts[1] += dy;
   1945             ppDsts[2] += dy;
   1946             ppDsts[3] += dy;
   1947         }
   1948 #else
   1949         uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   1950             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   1951         uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
   1952 
   1953         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   1954         {
   1955             for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
   1956             {
   1957                 uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
   1958 
   1959                 uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
   1960                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   1961 
   1962                 // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   1963                 pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   1964             }
   1965 
   1966             pRow0 += (DestRowWidthBytes * 2);
   1967             pRow1 += (DestRowWidthBytes * 2);
   1968         }
   1969 #endif
   1970     }
   1971 };
   1972 
   1973 //////////////////////////////////////////////////////////////////////////
   1974 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
   1975 //////////////////////////////////////////////////////////////////////////
   1976 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   1977 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
   1978 {
   1979     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
   1980     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   1981 
   1982     //////////////////////////////////////////////////////////////////////////
   1983     /// @brief Stores an 8x8 raster tile to the destination surface.
   1984     /// @param pSrc - Pointer to raster tile.
   1985     /// @param pDstSurface - Destination surface state
   1986     /// @param x, y - Coordinates to raster tile.
   1987     INLINE static void Store(
   1988         uint8_t *pSrc,
   1989         SWR_SURFACE_STATE* pDstSurface,
   1990         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   1991     {
   1992         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   1993         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
   1994 
   1995         // Punt non-full tiles to generic store
   1996         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   1997         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   1998 
   1999         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   2000         {
   2001             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   2002         }
   2003 
   2004         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   2005         // We can compute the offsets to each column within the raster tile once and increment from these.
   2006 #if USE_8x2_TILE_BACKEND
   2007         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   2008         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2009             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2010 
   2011         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   2012         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   2013 
   2014         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2015         uint8_t *ppDsts[] =
   2016         {
   2017             pDst,                                           // row 0, col 0
   2018             pDst + DestRowWidthBytes,                       // row 1, col 0
   2019             pDst + DestColumnBytes,                         // row 0, col 1
   2020             pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
   2021         };
   2022 
   2023         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   2024         {
   2025             // Raster tile width is same as simd16 tile width
   2026             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   2027 
   2028             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2029 
   2030             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   2031 
   2032             ppDsts[0] += dy;
   2033             ppDsts[1] += dy;
   2034             ppDsts[2] += dy;
   2035             ppDsts[3] += dy;
   2036         }
   2037 #else
   2038         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   2039         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2040             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2041 
   2042         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   2043         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   2044 
   2045         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2046         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   2047         {
   2048             uint32_t rowOffset = row * DestRowWidthBytes;
   2049 
   2050             uint8_t* pRow = pCol0 + rowOffset;
   2051             uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
   2052 
   2053             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2054             pSrc += pSrcInc;
   2055 
   2056             ppDsts[0] += DestColumnBytes;
   2057             ppDsts[1] += DestColumnBytes;
   2058 
   2059             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2060             pSrc += pSrcInc;
   2061         }
   2062 #endif
   2063     }
   2064 };
   2065 
   2066 //////////////////////////////////////////////////////////////////////////
   2067 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
   2068 //////////////////////////////////////////////////////////////////////////
   2069 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   2070 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
   2071 {
   2072     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
   2073     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   2074 
   2075     //////////////////////////////////////////////////////////////////////////
   2076     /// @brief Stores an 8x8 raster tile to the destination surface.
   2077     /// @param pSrc - Pointer to raster tile.
   2078     /// @param pDstSurface - Destination surface state
   2079     /// @param x, y - Coordinates to raster tile.
   2080     INLINE static void Store(
   2081         uint8_t *pSrc,
   2082         SWR_SURFACE_STATE* pDstSurface,
   2083         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   2084     {
   2085         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   2086         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
   2087 
   2088         // Punt non-full tiles to generic store
   2089         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   2090         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   2091 
   2092         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   2093         {
   2094             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   2095         }
   2096 
   2097         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   2098         // We can compute the offsets to each column within the raster tile once and increment from these.
   2099 #if USE_8x2_TILE_BACKEND
   2100         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   2101         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2102             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2103 
   2104         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   2105         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   2106 
   2107         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2108         uint8_t *ppDsts[] =
   2109         {
   2110             pDst,                                           // row 0, col 0
   2111             pDst + DestRowWidthBytes,                       // row 1, col 0
   2112             pDst + DestColumnBytes,                         // row 0, col 1
   2113             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
   2114             pDst + DestColumnBytes * 2,                     // row 0, col 2
   2115             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
   2116             pDst + DestColumnBytes * 3,                     // row 0, col 3
   2117             pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
   2118         };
   2119 
   2120         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   2121         {
   2122             // Raster tile width is same as simd16 tile width
   2123             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   2124 
   2125             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2126 
   2127             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   2128 
   2129             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
   2130             {
   2131                 ppDsts[i] += dy;
   2132             }
   2133         }
   2134 #else
   2135         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   2136         uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2137             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2138         uint8_t* pCol1 = pCol0 + DestColumnBytes;
   2139 
   2140         // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
   2141         // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
   2142         uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
   2143 
   2144         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2145         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
   2146         {
   2147             uint32_t rowOffset = row * DestRowWidthBytes;
   2148             uint8_t* ppDsts[] =
   2149             {
   2150                 pCol0 + rowOffset,
   2151                 pCol0 + rowOffset + DestRowWidthBytes,
   2152                 pCol1 + rowOffset,
   2153                 pCol1 + rowOffset + DestRowWidthBytes,
   2154             };
   2155 
   2156             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2157             pSrc += pSrcInc;
   2158 
   2159             ppDsts[0] += DestColumnBytes * 2;
   2160             ppDsts[1] += DestColumnBytes * 2;
   2161             ppDsts[2] += DestColumnBytes * 2;
   2162             ppDsts[3] += DestColumnBytes * 2;
   2163 
   2164             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2165             pSrc += pSrcInc;
   2166         }
   2167 #endif
   2168     }
   2169 };
   2170 
   2171 //////////////////////////////////////////////////////////////////////////
   2172 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
   2173 //////////////////////////////////////////////////////////////////////////
   2174 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   2175 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
   2176 {
   2177     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
   2178 #if USE_8x2_TILE_BACKEND
   2179     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   2180 
   2181 #else
   2182     static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
   2183     static const size_t TILE_Y_ROWS = 32;
   2184     static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
   2185 
   2186     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
   2187     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
   2188     static const size_t MAX_DST_COLUMN_BYTES = 16;
   2189 
   2190     static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
   2191     static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
   2192 
   2193 #endif
   2194     //////////////////////////////////////////////////////////////////////////
   2195     /// @brief Stores an 8x8 raster tile to the destination surface.
   2196     /// @param pSrc - Pointer to raster tile.
   2197     /// @param pDstSurface - Destination surface state
   2198     /// @param x, y - Coordinates to raster tile.
   2199     INLINE static void Store(
   2200         uint8_t *pSrc,
   2201         SWR_SURFACE_STATE* pDstSurface,
   2202         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
   2203     {
   2204 #if USE_8x2_TILE_BACKEND
   2205         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
   2206         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
   2207 #endif
   2208 
   2209         // Punt non-full tiles to generic store
   2210         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
   2211         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
   2212 
   2213         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
   2214         {
   2215             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
   2216         }
   2217 
   2218         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
   2219         // We can compute the offsets to each column within the raster tile once and increment from these.
   2220 #if USE_8x2_TILE_BACKEND
   2221         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
   2222         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2223             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2224 
   2225         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
   2226         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
   2227 
   2228         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
   2229         uint8_t *ppDsts[] =
   2230         {
   2231             pDst,                                           // row 0, col 0
   2232             pDst + DestRowWidthBytes,                       // row 1, col 0
   2233             pDst + DestColumnBytes,                         // row 0, col 1
   2234             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
   2235             pDst + DestColumnBytes * 2,                     // row 0, col 2
   2236             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
   2237             pDst + DestColumnBytes * 3,                     // row 0, col 3
   2238             pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
   2239             pDst + DestColumnBytes * 4,                     // row 0, col 4
   2240             pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
   2241             pDst + DestColumnBytes * 5,                     // row 0, col 5
   2242             pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
   2243             pDst + DestColumnBytes * 6,                     // row 0, col 6
   2244             pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
   2245             pDst + DestColumnBytes * 7,                     // row 0, col 7
   2246             pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
   2247         };
   2248 
   2249         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
   2250         {
   2251             // Raster tile width is same as simd16 tile width
   2252             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
   2253 
   2254             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
   2255 
   2256             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
   2257 
   2258             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
   2259             {
   2260                 ppDsts[i] += dy;
   2261             }
   2262         }
   2263 #else
   2264         // There will be 8 4x2 simd tiles in an 8x8 raster tile.
   2265         uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
   2266             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
   2267         struct DstPtrs
   2268         {
   2269             uint8_t* ppDsts[8];
   2270         } ptrs;
   2271 
   2272         // Need 8 pointers, 4 columns of 2 rows each
   2273         for (uint32_t y = 0; y < 2; ++y)
   2274         {
   2275             for (uint32_t x = 0; x < 4; ++x)
   2276             {
   2277                 ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
   2278             }
   2279         }
   2280 
   2281         for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
   2282         {
   2283             DstPtrs startPtrs = ptrs;
   2284 
   2285             for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
   2286             {
   2287                 // Format conversion and convert from SOA to AOS, and store the rows.
   2288                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
   2289 
   2290                 ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
   2291                 ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
   2292                 ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
   2293                 ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
   2294                 ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
   2295                 ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
   2296                 ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
   2297                 ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
   2298                 pSrc += SRC_COLUMN_BYTES;
   2299             }
   2300 
   2301             ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2302             ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2303             ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2304             ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2305             ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2306             ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2307             ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2308             ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
   2309         }
   2310 #endif
   2311     }
   2312 };
   2313 
   2314 //////////////////////////////////////////////////////////////////////////
   2315 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
   2316 //////////////////////////////////////////////////////////////////////////
   2317 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
   2318 struct StoreMacroTile
   2319 {
   2320     //////////////////////////////////////////////////////////////////////////
   2321     /// @brief Stores a macrotile to the destination surface using safe implementation.
   2322     /// @param pSrc - Pointer to macro tile.
   2323     /// @param pDstSurface - Destination surface state
   2324     /// @param x, y - Coordinates to macro tile
   2325     static void StoreGeneric(
   2326         uint8_t *pSrcHotTile,
   2327         SWR_SURFACE_STATE* pDstSurface,
   2328         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
   2329     {
   2330         PFN_STORE_TILES_INTERNAL pfnStore;
   2331         pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
   2332 
   2333         // Store each raster tile from the hot tile to the destination surface.
   2334         for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
   2335         {
   2336             for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
   2337             {
   2338                 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
   2339                 {
   2340                     pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
   2341                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
   2342                 }
   2343             }
   2344         }
   2345 
   2346     }
   2347 
   2348     typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
   2349     //////////////////////////////////////////////////////////////////////////
   2350     /// @brief Stores a macrotile to the destination surface.
   2351     /// @param pSrc - Pointer to macro tile.
   2352     /// @param pDstSurface - Destination surface state
   2353     /// @param x, y - Coordinates to macro tile
   2354     static void Store(
   2355         uint8_t *pSrcHotTile,
   2356         SWR_SURFACE_STATE* pDstSurface,
   2357         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
   2358     {
   2359         PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
   2360 
   2361         for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
   2362         {
   2363             size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
   2364                 0,
   2365                 0,
   2366                 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
   2367                 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
   2368                 sampleNum,
   2369                 pDstSurface->lod,
   2370                 pDstSurface);
   2371 
   2372             // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
   2373             bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
   2374                 (pDstSurface->bInterleavedSamples);
   2375 
   2376             pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
   2377         }
   2378 
   2379         // Save original for pSrcHotTile resolve.
   2380         uint8_t *pResolveSrcHotTile = pSrcHotTile;
   2381 
   2382         // Store each raster tile from the hot tile to the destination surface.
   2383         for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
   2384         {
   2385             for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
   2386             {
   2387                 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
   2388                 {
   2389                     pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
   2390                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
   2391                 }
   2392             }
   2393         }
   2394 
   2395         if (pDstSurface->xpAuxBaseAddress)
   2396         {
   2397             uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
   2398             // Store each raster tile from the hot tile to the destination surface.
   2399             for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
   2400             {
   2401                 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
   2402                 {
   2403                     StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
   2404                     pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
   2405                 }
   2406             }
   2407         }
   2408     }
   2409 };
   2410 
   2411 //////////////////////////////////////////////////////////////////////////
   2412 /// InitStoreTilesTable - Helper for setting up the tables.
   2413 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
   2414 void InitStoreTilesTableColor_Half1(
   2415     PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
   2416 {
   2417     table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
   2418     table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
   2419     table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
   2420     table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
   2421     table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
   2422     table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
   2423     table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
   2424     table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
   2425     table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
   2426     table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
   2427     table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
   2428     table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
   2429     table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
   2430     table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
   2431     table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
   2432     table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
   2433     table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
   2434     table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
   2435     table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
   2436     table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
   2437     table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
   2438     table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
   2439     table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
   2440     table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
   2441     table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
   2442     table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
   2443     table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
   2444     table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
   2445     table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
   2446     table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
   2447     table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
   2448     table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
   2449     table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
   2450     table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
   2451     table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
   2452     table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
   2453     table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
   2454     table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
   2455     table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
   2456     table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
   2457     table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
   2458     table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
   2459     table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
   2460     table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
   2461     table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
   2462     table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
   2463     table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
   2464     table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
   2465     table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
   2466     table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
   2467     table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
   2468     table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
   2469     table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
   2470     table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
   2471     table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
   2472     table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
   2473 }
   2474 
   2475 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
   2476 void InitStoreTilesTableColor_Half2(
   2477     PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
   2478 {
   2479     table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
   2480     table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
   2481     table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
   2482     table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
   2483     table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
   2484     table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
   2485     table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
   2486     table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
   2487     table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
   2488     table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
   2489     table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
   2490     table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
   2491     table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
   2492     table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
   2493     table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
   2494     table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
   2495     table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
   2496     table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
   2497     table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
   2498     table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
   2499     table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
   2500     table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
   2501     table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
   2502     table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
   2503     table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
   2504     table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
   2505     table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
   2506     table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
   2507     table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
   2508     table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
   2509     table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
   2510     table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
   2511     table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
   2512     table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
   2513     table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
   2514     table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
   2515     table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
   2516     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
   2517     table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
   2518     table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
   2519     table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
   2520     table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
   2521     table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
   2522     table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
   2523     table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
   2524     table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
   2525     table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
   2526     table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
   2527     table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
   2528     table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
   2529     table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
   2530     table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
   2531     table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
   2532     table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
   2533     table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
   2534     table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
   2535     table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
   2536     table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
   2537     table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
   2538     table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
   2539     table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
   2540     table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
   2541     table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
   2542     table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
   2543 }
   2544 
   2545 //////////////////////////////////////////////////////////////////////////
   2546 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
   2547 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
   2548 void InitStoreTilesTableDepth(
   2549     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
   2550 {
   2551    table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
   2552    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
   2553    table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
   2554    table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
   2555 }
   2556 
   2557 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
   2558 void InitStoreTilesTableStencil(
   2559     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
   2560 {
   2561     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
   2562 }
   2563