/* Home | History | Annotate | Download | only in arm
 * (code-search navigation header retained as a comment so the file compiles)
 */
      1 
      2 /* filter_neon_intrinsics.c - NEON optimised filter functions
      3  *
      4  * Copyright (c) 2014 Glenn Randers-Pehrson
      5  * Written by James Yu <james.yu at linaro.org>, October 2013.
      6  * Based on filter_neon.S, written by Mans Rullgard, 2011.
      7  *
      8  * Last changed in libpng 1.6.16 [December 22, 2014]
      9  *
     10  * This code is released under the libpng license.
     11  * For conditions of distribution and use, see the disclaimer
     12  * and license in png.h
     13  */
     14 
     15 #include "../pngpriv.h"
     16 
     17 #ifdef PNG_READ_SUPPORTED
     18 
     19 /* This code requires -mfpu=neon on the command line: */
     20 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
     21 
     22 #include <arm_neon.h>
     23 
     24 /* libpng row pointers are not necessarily aligned to any particular boundary,
     25  * however this code will only work with appropriate alignment.  arm/arm_init.c
     26  * checks for this (and will not compile unless it is done). This code uses
     27  * variants of png_aligncast to avoid compiler warnings.
     28  */
     29 #define png_ptr(type,pointer) png_aligncast(type *,pointer)
     30 #define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
     31 
     32 /* The following relies on a variable 'temp_pointer' being declared with type
     33  * 'type'.  This is written this way just to hide the GCC strict aliasing
     34  * warning; note that the code is safe because there never is an alias between
     35  * the input and output pointers.
     36  */
     37 #define png_ldr(type,pointer)\
     38    (temp_pointer = png_ptr(type,pointer), *temp_pointer)
     39 
     40 #if PNG_ARM_NEON_OPT > 0
     41 
     42 void
     43 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
     44    png_const_bytep prev_row)
     45 {
     46    png_bytep rp = row;
     47    png_bytep rp_stop = row + row_info->rowbytes;
     48    png_const_bytep pp = prev_row;
     49 
     50    for (; rp < rp_stop; rp += 16, pp += 16)
     51    {
     52       uint8x16_t qrp, qpp;
     53 
     54       qrp = vld1q_u8(rp);
     55       qpp = vld1q_u8(pp);
     56       qrp = vaddq_u8(qrp, qpp);
     57       vst1q_u8(rp, qrp);
     58    }
     59 }
     60 
/* Undo the PNG 'Sub' filter for 3-byte (RGB) pixels: each pixel has the
 * reconstructed pixel to its left added to it.  Four pixels (12 bytes) are
 * processed per loop iteration; vdest.val[3] carries the previous pixel's
 * value across iterations.  prev_row is unused by this filter.
 */
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   /* Prime vrp with the first 16 bytes of the row; the type-pun via
    * png_ptr splits the q-register into a pair of d-registers.
    */
   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0); /* nothing to the left of the first pixel */

   for (; rp < rp_stop;)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer; /* required by the png_ldr macro */

      /* Extract pixels 1 and 2 of the group (byte offsets 3 and 6) and
       * accumulate the running left-neighbour sum pixel by pixel.
       */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      /* Pixel 3 sits at byte offset 9, i.e. offset 1 into vrp.val[1]. */
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      /* Load the next group BEFORE the stores below: the load at rp + 12
       * must see the original filtered bytes, not this group's output.
       * NOTE(review): this load (and the last store) can run a few bytes
       * past rowbytes on the final iteration - presumably covered by
       * libpng's row-buffer slack; confirm against arm/arm_init.c.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      /* Each pixel is stored as a 4-byte lane write but rp only advances
       * by 3, so the spilled 4th byte is overwritten by the next store.
       */
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}
    105 
/* Undo the PNG 'Sub' filter for 4-byte (RGBA) pixels: each pixel has the
 * reconstructed pixel to its left added to it.  Four pixels (16 bytes) are
 * processed per iteration; vdest.val[3] carries the previous pixel's value
 * into the next group.  prev_row is unused by this filter.
 */
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0); /* nothing to the left of the first pixel */

   for (; rp < rp_stop; rp += 16)
   {
      /* vld4_u32 de-interleaves 32-bit words so that each of the four
       * d-registers holds one pixel in lane 0; the pun to uint8x8x4_t
       * lets the bytes be added with vadd_u8.
       */
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
      uint8x8x4_t vrp = *vrpt;
      uint32x2x4_t *temp_pointer; /* required by the png_ldr macro */

      /* Running left-neighbour sum, pixel by pixel. */
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
      /* Store lane 0 of all four pixels, re-interleaved, in one shot. */
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)
}
    132 
/* Undo the PNG 'Average' filter for 3-byte (RGB) pixels: each byte has
 * floor((left + above) / 2) added to it.  Four pixels (12 bytes) per loop
 * iteration; vdest.val[3] carries the previous reconstructed pixel across
 * iterations.  vhadd_u8 is the unsigned halving add, which computes
 * (a + b) >> 1 without losing the 9th bit - exactly the PNG average.
 */
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0); /* nothing to the left of the first pixel */

   /* Prime vrp with the first 16 bytes of the current row. */
   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   /* rp is advanced by the stores at the bottom of the loop (4 x 3). */
   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer; /* required by the png_ldr macro */

      /* Load the matching 16 bytes of the previous (reconstructed) row. */
      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* Pixel 0: average of previous pixel and the byte above, then add
       * the filtered byte.  vtmp1/vtmp2/vtmp3 extract the later pixels
       * (byte offsets 3 and 6) before their inputs are overwritten.
       */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      /* Load the next group of the current row BEFORE the overlapping
       * stores below clobber it.  NOTE(review): may read/write slightly
       * past rowbytes on the last iteration - presumably covered by
       * libpng's row-buffer slack; confirm against arm/arm_init.c.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      /* 4-byte lane stores advancing by 3: the spilled byte is
       * overwritten by the following store.
       */
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}
    198 
/* Undo the PNG 'Average' filter for 4-byte (RGBA) pixels: each byte has
 * floor((left + above) / 2) added to it.  Four pixels (16 bytes) per
 * iteration; vdest.val[3] carries the previous reconstructed pixel across
 * iterations.  vhadd_u8 computes (a + b) >> 1 without 8-bit overflow.
 */
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0); /* nothing to the left of the first pixel */

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer; /* required by the png_ldr macro */

      /* De-interleave one pixel per d-register from the current row (vrp)
       * and the previous row (vpp) via vld4 + type pun.
       */
      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      /* Per pixel: average of left neighbour and byte above, plus the
       * filtered byte; each result feeds the next pixel's 'left'.
       */
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      /* Re-interleave and store lane 0 of all four pixels at once. */
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}
    236 
    237 static uint8x8_t
    238 paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
    239 {
    240    uint8x8_t d, e;
    241    uint16x8_t p1, pa, pb, pc;
    242 
    243    p1 = vaddl_u8(a, b); /* a + b */
    244    pc = vaddl_u8(c, c); /* c * 2 */
    245    pa = vabdl_u8(b, c); /* pa */
    246    pb = vabdl_u8(a, c); /* pb */
    247    pc = vabdq_u16(p1, pc); /* pc */
    248 
    249    p1 = vcleq_u16(pa, pb); /* pa <= pb */
    250    pa = vcleq_u16(pa, pc); /* pa <= pc */
    251    pb = vcleq_u16(pb, pc); /* pb <= pc */
    252 
    253    p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
    254 
    255    d = vmovn_u16(pb);
    256    e = vmovn_u16(p1);
    257 
    258    d = vbsl_u8(d, b, c);
    259    e = vbsl_u8(e, a, d);
    260 
    261    return e;
    262 }
    263 
/* Undo the PNG 'Paeth' filter for 3-byte (RGB) pixels: each byte has
 * paeth(left, above, upper-left) added to it.  Four pixels (12 bytes) per
 * loop iteration.  vdest.val[3] carries the previous reconstructed pixel
 * (the 'left' input) and vlast the previous row's last pixel of the prior
 * group (the 'upper-left' input) across iterations.
 */
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);  /* upper-left of the first pixel */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);     /* left of the first pixel */

   /* Prime vrp with the first 16 bytes of the current row. */
   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   /* rp is advanced by the stores at the bottom of the loop (4 x 3). */
   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer; /* required by the png_ldr macro */

      /* Load the matching 16 bytes of the previous row. */
      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      /* Pixel 0: predictor from (left, above, upper-left). */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      /* Pixel 1 lives at byte offset 3 in both rows. */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      /* Pixel 2 at byte offset 6. */
      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      /* Pixel 3 at byte offset 9 (offset 1 into the high halves). */
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      /* Load the next group of the current row BEFORE the overlapping
       * stores below clobber it.  NOTE(review): may read/write slightly
       * past rowbytes on the last iteration - presumably covered by
       * libpng's row-buffer slack; confirm against arm/arm_init.c.
       */
      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      /* This group's last above-pixel becomes the next group's
       * upper-left.
       */
      vlast = vtmp2;

      /* 4-byte lane stores advancing by 3: the spilled byte is
       * overwritten by the following store.
       */
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}
    329 
/* Undo the PNG 'Paeth' filter for 4-byte (RGBA) pixels: each byte has
 * paeth(left, above, upper-left) added to it.  Four pixels (16 bytes) per
 * iteration.  vdest.val[3] carries the previous reconstructed pixel
 * ('left') and vlast the previous row's last pixel of the prior group
 * ('upper-left') across iterations.
 */
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);  /* upper-left of the first pixel */
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);     /* left of the first pixel */

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer; /* required by the png_ldr macro */

      /* De-interleave one pixel per d-register from the current row (vrp)
       * and the previous row (vpp) via vld4 + type pun.
       */
      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      /* Per pixel: Paeth predictor plus the filtered byte; each result
       * feeds the next pixel's 'left', and the above-pixel becomes the
       * next pixel's 'upper-left'.
       */
      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      /* Carry the last above-pixel into the next group. */
      vlast = vpp.val[3];

      /* Re-interleave and store lane 0 of all four pixels at once. */
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}
    370 
    371 #endif /* PNG_ARM_NEON_OPT > 0 */
    372 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
    373 #endif /* READ */
    374