/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <chad.versace@linux.intel.com>
 *    Frank Henigman <fjhenigman@google.com>
 */

#include <string.h>

#include "util/macros.h"

#include "brw_context.h"
#include "intel_tiled_memcpy.h"

#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.  Width and span are in bytes, height is in pixels (i.e.
 * unitless).  A "span" is the maximum number of bytes we can copy from
 * linear to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
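
/* Note that both layouts describe a 4 KB tile: 512 B x 8 rows for X tiles
 * and 128 B x 32 rows for Y tiles.  A Y-tile row crosses a column boundary
 * every 16 bytes, hence the short Y span.  An X-tile row is 512 contiguous
 * bytes, but the span is capped at 64 so that a single copy never straddles
 * a 64-byte boundary, across which bit-6 swizzling (see below) would make
 * the tiled address discontiguous.
 */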

/* Rotate the 32-bit value 'n' right by 'd' bits, for 0 < d < 32. */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> d) | (n << (32 - d));
}

/* Reverse the byte order of the 32-bit value 'n'. */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}

/**
 * Copy RGBA to BGRA - swap R and B.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint32_t *d = dst;
   uint32_t const *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      *d = ror(bswap32(*s), 8);
      d += 1;
      s += 1;
      bytes -= 4;
   }
   return dst;
}
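
/* Worked example of the trick above, on a little-endian CPU: the RGBA8
 * pixel bytes [R,G,B,A] load as the uint32_t 0xAABBGGRR; bswap32 yields
 * 0xRRGGBBAA, and rotating right by 8 yields 0xAARRGGBB, which stores back
 * as the BGRA8 bytes [B,G,R,A].
 */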

#ifdef __SSSE3__
/* pshufb control: output byte i is input byte rgba8_permutation[i], i.e.
 * swap bytes 0 and 2 (R and B) within each 32-bit pixel.
 */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
/* Without pshufb, split each pixel into its G/A bytes (bytes 1 and 3, kept
 * in place) and its R/B bytes, swap the R and B 16-bit words with shuffles,
 * then OR the two halves back together.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst +  0, src +  0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}
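
/* The bytes == 64 special case above covers exactly one xtile_span (also
 * one 64-byte cache line) with straight-line code; it is the common
 * whole-span path taken by the X-tiling loops below.
 */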

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(dst +  0, src +  0);
      rgba8_copy_16_aligned_src(dst + 16, src + 16);
      rgba8_copy_16_aligned_src(dst + 32, src + 32);
      rgba8_copy_16_aligned_src(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty.  The region copied must land entirely within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 in the copy destination offset, when certain other
 * bits are set in it.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             mem_copy_fn mem_copy);

/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * The mem_copy parameters allow the user to specify an alternative mem_copy
 * function that, for instance, may do RGBA -> BGRA swizzling.  The first
 * function must handle any memory alignment while the second function must
 * only handle 16-byte alignment in whichever side (source or destination) is
 * tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
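      /* For example, on the tile's second row yo = 512 = 0x200: bit 9 is
       * set and bit 10 is clear, so ((0x200 >> 3) ^ (0x200 >> 4)) & (1 << 6)
       * = (0x40 ^ 0x20) & 0x40 = 0x40, flipping bit 6 for that whole row.
       */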
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
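   /* Worked example: byte x = 20 on row y = 1 of a tile lands at offset
    * (20 % 16) + (20 / 16) * (16 * 32) + 1 * 16 = 4 + 512 + 16 = 532.
    */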
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
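      /* That is because bytes_per_column = 16 * 32 = 512 = 1 << 9, so
       * advancing 'xo' by one column toggles exactly bit 9 of the offset,
       * the bit that 'swizzle' is derived from.
       */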
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* The copy source offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy source offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}

/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the source offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy source offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the source offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}


/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   /* Not reached when unreachable() expands to __builtin_unreachable();
    * kept as a portable fallback for an unexpected 'mem_copy'.
    */
   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from linear to Y tile layout, faster.
 *
 * Same as \ref linear_to_ytiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from X tile layout to linear, faster.
 *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from Y tile layout to linear, faster.
 *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy from linear to tiled texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the start of the texture and 'src' is the corresponding
 * address to copy from, though copying begins at (xt1, yt1).
 */
void
linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                uint32_t dst_pitch, int32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t) xt * th + (ptrdiff_t) yt * dst_pitch,
                   src + (ptrdiff_t) xt      + (ptrdiff_t) yt * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}
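
/* A sketch of a typical call, with hypothetical names ('pixels', 'bo_map',
 * 'tiled_pitch'): upload a 64x16-pixel RGBA8 rectangle whose top-left pixel
 * is (px, py).  'src' is backed up by the copy origin so that
 * src + xt1 + yt1 * src_pitch points at the first pixel:
 *
 *    linear_to_tiled(px * 4, (px + 64) * 4,   // X range in bytes
 *                    py, py + 16,             // Y range in rows
 *                    bo_map,
 *                    pixels - px * 4 - (ptrdiff_t)py * src_pitch,
 *                    tiled_pitch, src_pitch,
 *                    true, I915_TILING_X, memcpy);
 */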

/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the start of the texture and 'src' is the corresponding
 * address to copy from, though copying begins at (xt1, yt1).
 */
void
tiled_to_linear(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                int32_t dst_pitch, uint32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles from which we have something to copy.
    * 'xt' and 'yt' are the origin of the source tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t) xt      + (ptrdiff_t) yt * dst_pitch,
                   src + (ptrdiff_t) xt * th + (ptrdiff_t) yt * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}


/**
 * Determine which copy function to use for the given format combination
 *
 * The only two possible copy functions which are ever returned are a
 * direct memcpy and an RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
 * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
 * symmetric), it doesn't matter whether the copy is from the tiled image
 * to the untiled or vice versa.  The copy function required is the same in
 * either case so this function can be used.
 *
 * \param[in]  tiledFormat The format of the tiled image
 * \param[in]  format      The GL format of the client data
 * \param[in]  type        The GL type of the client data
 * \param[out] mem_copy    Will be set to one of either the standard
 *                         library's memcpy or a different copy function
 *                         that performs an RGBA to BGRA conversion
 * \param[out] cpp         Number of bytes per pixel
 *
 * \return true if the format and type combination are valid
 */
bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp)
{
   /* Start from a known state so the check at the end is well defined even
    * if no format case below matches.
    */
   *mem_copy = NULL;

   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
       !(format == GL_RGBA || format == GL_BGRA))
      return false; /* Invalid type/format combination */

   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
      *cpp = 1;
      *mem_copy = memcpy;
   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
              (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
              (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         *mem_copy = memcpy;
      } else if (format == GL_RGBA) {
         *mem_copy = rgba8_copy;
      }
   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
          * use the same function.
          */
         *mem_copy = rgba8_copy;
      } else if (format == GL_RGBA) {
         *mem_copy = memcpy;
      }
   }

   if (!(*mem_copy))
      return false;

   return true;
}
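
/* Sketch of the intended pairing, with hypothetical names modeled on the
 * texture upload/readpixels paths: choose the copy function once, then
 * detile the mapped buffer.
 *
 *    mem_copy_fn mem_copy = NULL;
 *    uint32_t cpp;
 *    if (intel_get_memcpy(tiled_format, format, type, &mem_copy, &cpp))
 *       tiled_to_linear(x * cpp, (x + w) * cpp, y, y + h,
 *                       pixels - x * cpp - (ptrdiff_t)y * dst_pitch, bo_map,
 *                       dst_pitch, tiled_pitch,
 *                       has_swizzling, tiling, mem_copy);
 */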
    863