Home | History | Annotate | Download | only in arm
      1 /*
      2  * Copyright (C) 2012 Gabor Rapcsanyi (rgabor (at) inf.u-szeged.hu), University of Szeged
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef GraphicsContext3DNEON_h
     27 #define GraphicsContext3DNEON_h
     28 
     29 #if HAVE(ARM_NEON_INTRINSICS)
     30 
     31 #include <arm_neon.h>
     32 
     33 namespace WebCore {
     34 
     35 namespace SIMD {
     36 
     37 ALWAYS_INLINE void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
     38 {
     39     unsigned componentsPerRow = pixelsPerRow * 4;
     40     unsigned tailComponents = componentsPerRow % 16;
     41     unsigned componentsSize = componentsPerRow - tailComponents;
     42     const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
     43 
     44     for (unsigned i = 0; i < componentsSize; i += 16) {
     45         uint8x16x2_t components = vld2q_u8(src + i * 2);
     46         vst1q_u8(destination + i, components.val[1]);
     47     }
     48 
     49     source += componentsSize;
     50     destination += componentsSize;
     51     pixelsPerRow = tailComponents / 4;
     52 }
     53 
     54 ALWAYS_INLINE void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
     55 {
     56     unsigned componentsPerRow = pixelsPerRow * 3;
     57     unsigned tailComponents = componentsPerRow % 24;
     58     unsigned componentsSize = componentsPerRow - tailComponents;
     59 
     60     uint8x8_t componentA = vdup_n_u8(0xFF);
     61     for (unsigned i = 0; i < componentsSize; i += 24) {
     62         uint16x8x3_t RGB16 = vld3q_u16(source + i);
     63         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(RGB16.val[0], 8));
     64         uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(RGB16.val[1], 8));
     65         uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(RGB16.val[2], 8));
     66         uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
     67         vst4_u8(destination, RGBA8);
     68         destination += 32;
     69     }
     70 
     71     source += componentsSize;
     72     pixelsPerRow = tailComponents / 3;
     73 }
     74 
     75 ALWAYS_INLINE void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
     76 {
     77     unsigned componentsPerRow = pixelsPerRow * 4;
     78     unsigned tailComponents = componentsPerRow % 32;
     79     unsigned componentsSize = componentsPerRow - tailComponents;
     80 
     81     for (unsigned i = 0; i < componentsSize; i += 32) {
     82         uint16x8x4_t ARGB16 = vld4q_u16(source + i);
     83         uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8));
     84         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8));
     85         uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8));
     86         uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8));
     87         uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
     88         vst4_u8(destination + i, RGBA8);
     89     }
     90 
     91     source += componentsSize;
     92     destination += componentsSize;
     93     pixelsPerRow = tailComponents / 4;
     94 }
     95 
     96 ALWAYS_INLINE void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
     97 {
     98     unsigned componentsPerRow = pixelsPerRow * 4;
     99     unsigned tailComponents = componentsPerRow % 32;
    100     unsigned componentsSize = componentsPerRow - tailComponents;
    101 
    102     for (unsigned i = 0; i < componentsSize; i += 32) {
    103         uint16x8x4_t ARGB16 = vld4q_u16(source + i);
    104         uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8));
    105         uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8));
    106         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8));
    107         uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8));
    108         uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}};
    109         vst4_u8(destination + i, RGBA8);
    110     }
    111 
    112     source += componentsSize;
    113     destination += componentsSize;
    114     pixelsPerRow = tailComponents / 4;
    115 }
    116 
    117 ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
    118 {
    119     unsigned tailPixels = pixelsPerRow % 8;
    120     unsigned pixelSize = pixelsPerRow - tailPixels;
    121 
    122     uint16x8_t immediate0x0f = vdupq_n_u16(0x0F);
    123     for (unsigned i = 0; i < pixelSize; i += 8) {
    124         uint16x8_t eightPixels = vld1q_u16(source + i);
    125 
    126         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 12));
    127         uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 8), immediate0x0f));
    128         uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 4), immediate0x0f));
    129         uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x0f));
    130 
    131         componentR = vorr_u8(vshl_n_u8(componentR, 4), componentR);
    132         componentG = vorr_u8(vshl_n_u8(componentG, 4), componentG);
    133         componentB = vorr_u8(vshl_n_u8(componentB, 4), componentB);
    134         componentA = vorr_u8(vshl_n_u8(componentA, 4), componentA);
    135 
    136         uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
    137         vst4_u8(destination, destComponents);
    138         destination += 32;
    139     }
    140 
    141     source += pixelSize;
    142     pixelsPerRow = tailPixels;
    143 }
    144 
    145 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort4444(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
    146 {
    147     unsigned componentsPerRow = pixelsPerRow * 4;
    148     unsigned tailComponents = componentsPerRow % 32;
    149     unsigned componentsSize = componentsPerRow - tailComponents;
    150 
    151     uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
    152     uint8x8_t immediate0xf0 = vdup_n_u8(0xF0);
    153     for (unsigned i = 0; i < componentsSize; i += 32) {
    154         uint8x8x4_t RGBA8 = vld4_u8(source + i);
    155 
    156         uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf0);
    157         uint8x8_t componentG = vshr_n_u8(vand_u8(RGBA8.val[1], immediate0xf0), 4);
    158         uint8x8_t componentB = vand_u8(RGBA8.val[2], immediate0xf0);
    159         uint8x8_t componentA = vshr_n_u8(vand_u8(RGBA8.val[3], immediate0xf0), 4);
    160 
    161         uint8x8x2_t RGBA4;
    162         RGBA4.val[0] = vorr_u8(componentB, componentA);
    163         RGBA4.val[1] = vorr_u8(componentR, componentG);
    164         vst2_u8(dst, RGBA4);
    165         dst += 16;
    166     }
    167 
    168     source += componentsSize;
    169     destination += componentsSize / 4;
    170     pixelsPerRow = tailComponents / 4;
    171 }
    172 
    173 ALWAYS_INLINE void unpackOneRowOfRGBA5551ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
    174 {
    175     unsigned tailPixels = pixelsPerRow % 8;
    176     unsigned pixelSize = pixelsPerRow - tailPixels;
    177 
    178     uint8x8_t immediate0x7 = vdup_n_u8(0x7);
    179     uint8x8_t immediate0xff = vdup_n_u8(0xFF);
    180     uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
    181     uint16x8_t immediate0x1 = vdupq_n_u16(0x1);
    182 
    183     for (unsigned i = 0; i < pixelSize; i += 8) {
    184         uint16x8_t eightPixels = vld1q_u16(source + i);
    185 
    186         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
    187         uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 6), immediate0x1f));
    188         uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 1), immediate0x1f));
    189         uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x1));
    190 
    191         componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
    192         componentG = vorr_u8(vshl_n_u8(componentG, 3), vand_u8(componentG, immediate0x7));
    193         componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
    194         componentA = vmul_u8(componentA, immediate0xff);
    195 
    196         uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
    197         vst4_u8(destination, destComponents);
    198         destination += 32;
    199     }
    200 
    201     source += pixelSize;
    202     pixelsPerRow = tailPixels;
    203 }
    204 
    205 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort5551(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
    206 {
    207     unsigned componentsPerRow = pixelsPerRow * 4;
    208     unsigned tailComponents = componentsPerRow % 32;
    209     unsigned componentsSize = componentsPerRow - tailComponents;
    210 
    211     uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
    212 
    213     uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
    214     uint8x8_t immediate0x18 = vdup_n_u8(0x18);
    215     for (unsigned i = 0; i < componentsSize; i += 32) {
    216         uint8x8x4_t RGBA8 = vld4_u8(source + i);
    217 
    218         uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
    219         uint8x8_t componentG3bit = vshr_n_u8(RGBA8.val[1], 5);
    220 
    221         uint8x8_t componentG2bit = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x18), 3);
    222         uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 2);
    223         uint8x8_t componentA = vshr_n_u8(RGBA8.val[3], 7);
    224 
    225         uint8x8x2_t RGBA5551;
    226         RGBA5551.val[0] = vorr_u8(vorr_u8(componentG2bit, componentB), componentA);
    227         RGBA5551.val[1] = vorr_u8(componentR, componentG3bit);
    228         vst2_u8(dst, RGBA5551);
    229         dst += 16;
    230     }
    231 
    232     source += componentsSize;
    233     destination += componentsSize / 4;
    234     pixelsPerRow = tailComponents / 4;
    235 }
    236 
    237 ALWAYS_INLINE void unpackOneRowOfRGB565ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
    238 {
    239     unsigned tailPixels = pixelsPerRow % 8;
    240     unsigned pixelSize = pixelsPerRow - tailPixels;
    241 
    242     uint16x8_t immediate0x3f = vdupq_n_u16(0x3F);
    243     uint16x8_t immediate0x1f = vdupq_n_u16(0x1F);
    244     uint8x8_t immediate0x3 = vdup_n_u8(0x3);
    245     uint8x8_t immediate0x7 = vdup_n_u8(0x7);
    246 
    247     uint8x8_t componentA = vdup_n_u8(0xFF);
    248 
    249     for (unsigned i = 0; i < pixelSize; i += 8) {
    250         uint16x8_t eightPixels = vld1q_u16(source + i);
    251 
    252         uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11));
    253         uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 5), immediate0x3f));
    254         uint8x8_t componentB = vqmovn_u16(vandq_u16(eightPixels, immediate0x1f));
    255 
    256         componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7));
    257         componentG = vorr_u8(vshl_n_u8(componentG, 2), vand_u8(componentG, immediate0x3));
    258         componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7));
    259 
    260         uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}};
    261         vst4_u8(destination, destComponents);
    262         destination += 32;
    263     }
    264 
    265     source += pixelSize;
    266     pixelsPerRow = tailPixels;
    267 }
    268 
    269 ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort565(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
    270 {
    271     unsigned componentsPerRow = pixelsPerRow * 4;
    272     unsigned tailComponents = componentsPerRow % 32;
    273     unsigned componentsSize = componentsPerRow - tailComponents;
    274     uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
    275 
    276     uint8x8_t immediate0xf8 = vdup_n_u8(0xF8);
    277     uint8x8_t immediate0x1c = vdup_n_u8(0x1C);
    278     for (unsigned i = 0; i < componentsSize; i += 32) {
    279         uint8x8x4_t RGBA8 = vld4_u8(source + i);
    280 
    281         uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8);
    282         uint8x8_t componentGLeft = vshr_n_u8(RGBA8.val[1], 5);
    283         uint8x8_t componentGRight = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x1c), 3);
    284         uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 3);
    285 
    286         uint8x8x2_t RGB565;
    287         RGB565.val[0] = vorr_u8(componentGRight, componentB);
    288         RGB565.val[1] = vorr_u8(componentR, componentGLeft);
    289         vst2_u8(dst, RGB565);
    290         dst += 16;
    291     }
    292 
    293     source += componentsSize;
    294     destination += componentsSize / 4;
    295     pixelsPerRow = tailComponents / 4;
    296 }
    297 
    298 } // namespace SIMD
    299 
    300 } // namespace WebCore
    301 
    302 #endif // HAVE(ARM_NEON_INTRINSICS)
    303 
    304 #endif // GraphicsContext3DNEON_h
    305