Home | History | Annotate | Download | only in renderer
      1 //
      2 // Copyright (c) 2002-2014 The ANGLE Project Authors. All rights reserved.
      3 // Use of this source code is governed by a BSD-style license that can be
      4 // found in the LICENSE file.
      5 //
      6 
      7 // loadimageSSE2.cpp: Defines image loading functions. It is kept
      8 // in a separate file because GCC can enable SSE usage only per-file,
      9 // not for individual code blocks that use SSE2 explicitly.
     10 
     11 #include "libGLESv2/renderer/loadimage.h"
     12 
     13 namespace rx
     14 {
     15 
     16 void LoadA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
     17                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
     18                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
     19 {
     20     __m128i zeroWide = _mm_setzero_si128();
     21 
     22     for (size_t z = 0; z < depth; z++)
     23     {
     24         for (size_t y = 0; y < height; y++)
     25         {
     26             const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
     27             uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
     28 
     29             size_t x = 0;
     30 
     31             // Make output writes aligned
     32             for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
     33             {
     34                 dest[x] = static_cast<uint32_t>(source[x]) << 24;
     35             }
     36 
     37             for (; x + 7 < width; x += 8)
     38             {
     39                 __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
     40                 // Interleave each byte to 16bit, make the lower byte to zero
     41                 sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
     42                 // Interleave each 16bit to 32bit, make the lower 16bit to zero
     43                 __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
     44                 __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
     45 
     46                 _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
     47                 _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
     48             }
     49 
     50             // Handle the remainder
     51             for (; x < width; x++)
     52             {
     53                 dest[x] = static_cast<uint32_t>(source[x]) << 24;
     54             }
     55         }
     56     }
     57 }
     58 
     59 void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
     60                            const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
     61                            uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
     62 {
     63     __m128i brMask = _mm_set1_epi32(0x00ff00ff);
     64 
     65     for (size_t z = 0; z < depth; z++)
     66     {
     67         for (size_t y = 0; y < height; y++)
     68         {
     69             const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
     70             uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
     71 
     72             size_t x = 0;
     73 
     74             // Make output writes aligned
     75             for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
     76             {
     77                 uint32_t rgba = source[x];
     78                 dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
     79             }
     80 
     81             for (; x + 3 < width; x += 4)
     82             {
     83                 __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
     84                 // Mask out g and a, which don't change
     85                 __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
     86                 // Mask out b and r
     87                 __m128i brComponents = _mm_and_si128(sourceData, brMask);
     88                 // Swap b and r
     89                 __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
     90                 __m128i result = _mm_or_si128(gaComponents, brSwapped);
     91                 _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
     92             }
     93 
     94             // Perform leftover writes
     95             for (; x < width; x++)
     96             {
     97                 uint32_t rgba = source[x];
     98                 dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
     99             }
    100         }
    101     }
    102 }
    103 
    104 }
    105