Home | History | Annotate | Download | only in arm
      1 
      2 /* filter_neon_intrinsics.c - NEON optimised filter functions
      3  *
      4  * Copyright (c) 2014,2016 Glenn Randers-Pehrson
      5  * Written by James Yu <james.yu at linaro.org>, October 2013.
      6  * Based on filter_neon.S, written by Mans Rullgard, 2011.
      7  *
      8  * Last changed in libpng 1.6.22 [May 26, 2016]
      9  *
     10  * This code is released under the libpng license.
     11  * For conditions of distribution and use, see the disclaimer
     12  * and license in png.h
     13  */
     14 
     15 #include "../pngpriv.h"
     16 
     17 #ifdef PNG_READ_SUPPORTED
     18 
     19 /* This code requires -mfpu=neon on the command line: */
     20 #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
     21 
     22 #include <arm_neon.h>
     23 
     24 /* libpng row pointers are not necessarily aligned to any particular boundary,
     25  * however this code will only work with appropriate alignment.  arm/arm_init.c
     26  * checks for this (and will not compile unless it is done). This code uses
     27  * variants of png_aligncast to avoid compiler warnings.
     28  */
     29 #define png_ptr(type,pointer) png_aligncast(type *,pointer)
     30 #define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
     31 
      32 /* The following relies on a variable 'temp_pointer' being declared with type
      33  * 'type *'.  This is written this way just to hide the GCC strict aliasing
      34  * warning; note that the code is safe because there never is an alias between
      35  * the input and output pointers.
      36  */
     37 #define png_ldr(type,pointer)\
     38    (temp_pointer = png_ptr(type,pointer), *temp_pointer)
     39 
     40 #if PNG_ARM_NEON_OPT > 0
     41 
     42 void
     43 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
     44    png_const_bytep prev_row)
     45 {
     46    png_bytep rp = row;
     47    png_bytep rp_stop = row + row_info->rowbytes;
     48    png_const_bytep pp = prev_row;
     49 
     50    png_debug(1, "in png_read_filter_row_up_neon");
     51 
     52    for (; rp < rp_stop; rp += 16, pp += 16)
     53    {
     54       uint8x16_t qrp, qpp;
     55 
     56       qrp = vld1q_u8(rp);
     57       qpp = vld1q_u8(pp);
     58       qrp = vaddq_u8(qrp, qpp);
     59       vst1q_u8(rp, qrp);
     60    }
     61 }
     62 
     63 void
     64 png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
     65    png_const_bytep prev_row)
     66 {
     67    png_bytep rp = row;
     68    png_bytep rp_stop = row + row_info->rowbytes;
     69 
     70    uint8x16_t vtmp = vld1q_u8(rp);
     71    uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
     72    uint8x8x2_t vrp = *vrpt;
     73 
     74    uint8x8x4_t vdest;
     75    vdest.val[3] = vdup_n_u8(0);
     76 
     77    png_debug(1, "in png_read_filter_row_sub3_neon");
     78 
     79    for (; rp < rp_stop;)
     80    {
     81       uint8x8_t vtmp1, vtmp2;
     82       uint32x2_t *temp_pointer;
     83 
     84       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
     85       vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
     86       vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
     87       vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
     88 
     89       vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
     90       vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
     91       vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
     92 
     93       vtmp = vld1q_u8(rp + 12);
     94       vrpt = png_ptr(uint8x8x2_t, &vtmp);
     95       vrp = *vrpt;
     96 
     97       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
     98       rp += 3;
     99       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
    100       rp += 3;
    101       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
    102       rp += 3;
    103       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
    104       rp += 3;
    105    }
    106 
    107    PNG_UNUSED(prev_row)
    108 }
    109 
    110 void
    111 png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
    112    png_const_bytep prev_row)
    113 {
    114    png_bytep rp = row;
    115    png_bytep rp_stop = row + row_info->rowbytes;
    116 
    117    uint8x8x4_t vdest;
    118    vdest.val[3] = vdup_n_u8(0);
    119 
    120    png_debug(1, "in png_read_filter_row_sub4_neon");
    121 
    122    for (; rp < rp_stop; rp += 16)
    123    {
    124       uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
    125       uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
    126       uint8x8x4_t vrp = *vrpt;
    127       uint32x2x4_t *temp_pointer;
    128 
    129       vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
    130       vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
    131       vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
    132       vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
    133       vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
    134    }
    135 
    136    PNG_UNUSED(prev_row)
    137 }
    138 
    139 void
    140 png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
    141    png_const_bytep prev_row)
    142 {
    143    png_bytep rp = row;
    144    png_const_bytep pp = prev_row;
    145    png_bytep rp_stop = row + row_info->rowbytes;
    146 
    147    uint8x16_t vtmp;
    148    uint8x8x2_t *vrpt;
    149    uint8x8x2_t vrp;
    150    uint8x8x4_t vdest;
    151    vdest.val[3] = vdup_n_u8(0);
    152 
    153    vtmp = vld1q_u8(rp);
    154    vrpt = png_ptr(uint8x8x2_t,&vtmp);
    155    vrp = *vrpt;
    156 
    157    png_debug(1, "in png_read_filter_row_avg3_neon");
    158 
    159    for (; rp < rp_stop; pp += 12)
    160    {
    161       uint8x8_t vtmp1, vtmp2, vtmp3;
    162 
    163       uint8x8x2_t *vppt;
    164       uint8x8x2_t vpp;
    165 
    166       uint32x2_t *temp_pointer;
    167 
    168       vtmp = vld1q_u8(pp);
    169       vppt = png_ptr(uint8x8x2_t,&vtmp);
    170       vpp = *vppt;
    171 
    172       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
    173       vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
    174       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
    175 
    176       vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
    177       vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
    178       vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
    179       vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
    180 
    181       vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
    182       vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
    183 
    184       vtmp = vld1q_u8(rp + 12);
    185       vrpt = png_ptr(uint8x8x2_t,&vtmp);
    186       vrp = *vrpt;
    187 
    188       vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
    189       vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
    190 
    191       vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
    192 
    193       vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
    194       vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
    195 
    196       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
    197       rp += 3;
    198       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
    199       rp += 3;
    200       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
    201       rp += 3;
    202       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
    203       rp += 3;
    204    }
    205 }
    206 
    207 void
    208 png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
    209    png_const_bytep prev_row)
    210 {
    211    png_bytep rp = row;
    212    png_bytep rp_stop = row + row_info->rowbytes;
    213    png_const_bytep pp = prev_row;
    214 
    215    uint8x8x4_t vdest;
    216    vdest.val[3] = vdup_n_u8(0);
    217 
    218    png_debug(1, "in png_read_filter_row_avg4_neon");
    219 
    220    for (; rp < rp_stop; rp += 16, pp += 16)
    221    {
    222       uint32x2x4_t vtmp;
    223       uint8x8x4_t *vrpt, *vppt;
    224       uint8x8x4_t vrp, vpp;
    225       uint32x2x4_t *temp_pointer;
    226 
    227       vtmp = vld4_u32(png_ptr(uint32_t,rp));
    228       vrpt = png_ptr(uint8x8x4_t,&vtmp);
    229       vrp = *vrpt;
    230       vtmp = vld4_u32(png_ptrc(uint32_t,pp));
    231       vppt = png_ptr(uint8x8x4_t,&vtmp);
    232       vpp = *vppt;
    233 
    234       vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
    235       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
    236       vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
    237       vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
    238       vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
    239       vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
    240       vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
    241       vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
    242 
    243       vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
    244    }
    245 }
    246 
    247 static uint8x8_t
    248 paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
    249 {
    250    uint8x8_t d, e;
    251    uint16x8_t p1, pa, pb, pc;
    252 
    253    p1 = vaddl_u8(a, b); /* a + b */
    254    pc = vaddl_u8(c, c); /* c * 2 */
    255    pa = vabdl_u8(b, c); /* pa */
    256    pb = vabdl_u8(a, c); /* pb */
    257    pc = vabdq_u16(p1, pc); /* pc */
    258 
    259    p1 = vcleq_u16(pa, pb); /* pa <= pb */
    260    pa = vcleq_u16(pa, pc); /* pa <= pc */
    261    pb = vcleq_u16(pb, pc); /* pb <= pc */
    262 
    263    p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
    264 
    265    d = vmovn_u16(pb);
    266    e = vmovn_u16(p1);
    267 
    268    d = vbsl_u8(d, b, c);
    269    e = vbsl_u8(e, a, d);
    270 
    271    return e;
    272 }
    273 
    274 void
    275 png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
    276    png_const_bytep prev_row)
    277 {
    278    png_bytep rp = row;
    279    png_const_bytep pp = prev_row;
    280    png_bytep rp_stop = row + row_info->rowbytes;
    281 
    282    uint8x16_t vtmp;
    283    uint8x8x2_t *vrpt;
    284    uint8x8x2_t vrp;
    285    uint8x8_t vlast = vdup_n_u8(0);
    286    uint8x8x4_t vdest;
    287    vdest.val[3] = vdup_n_u8(0);
    288 
    289    vtmp = vld1q_u8(rp);
    290    vrpt = png_ptr(uint8x8x2_t,&vtmp);
    291    vrp = *vrpt;
    292 
    293    png_debug(1, "in png_read_filter_row_paeth3_neon");
    294 
    295    for (; rp < rp_stop; pp += 12)
    296    {
    297       uint8x8x2_t *vppt;
    298       uint8x8x2_t vpp;
    299       uint8x8_t vtmp1, vtmp2, vtmp3;
    300       uint32x2_t *temp_pointer;
    301 
    302       vtmp = vld1q_u8(pp);
    303       vppt = png_ptr(uint8x8x2_t,&vtmp);
    304       vpp = *vppt;
    305 
    306       vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
    307       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
    308 
    309       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
    310       vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
    311       vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
    312       vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
    313 
    314       vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
    315       vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
    316       vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
    317       vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
    318 
    319       vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
    320       vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
    321 
    322       vtmp = vld1q_u8(rp + 12);
    323       vrpt = png_ptr(uint8x8x2_t,&vtmp);
    324       vrp = *vrpt;
    325 
    326       vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
    327       vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
    328 
    329       vlast = vtmp2;
    330 
    331       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
    332       rp += 3;
    333       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
    334       rp += 3;
    335       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
    336       rp += 3;
    337       vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
    338       rp += 3;
    339    }
    340 }
    341 
    342 void
    343 png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
    344    png_const_bytep prev_row)
    345 {
    346    png_bytep rp = row;
    347    png_bytep rp_stop = row + row_info->rowbytes;
    348    png_const_bytep pp = prev_row;
    349 
    350    uint8x8_t vlast = vdup_n_u8(0);
    351    uint8x8x4_t vdest;
    352    vdest.val[3] = vdup_n_u8(0);
    353 
    354    png_debug(1, "in png_read_filter_row_paeth4_neon");
    355 
    356    for (; rp < rp_stop; rp += 16, pp += 16)
    357    {
    358       uint32x2x4_t vtmp;
    359       uint8x8x4_t *vrpt, *vppt;
    360       uint8x8x4_t vrp, vpp;
    361       uint32x2x4_t *temp_pointer;
    362 
    363       vtmp = vld4_u32(png_ptr(uint32_t,rp));
    364       vrpt = png_ptr(uint8x8x4_t,&vtmp);
    365       vrp = *vrpt;
    366       vtmp = vld4_u32(png_ptrc(uint32_t,pp));
    367       vppt = png_ptr(uint8x8x4_t,&vtmp);
    368       vpp = *vppt;
    369 
    370       vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
    371       vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
    372       vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
    373       vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
    374       vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
    375       vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
    376       vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
    377       vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
    378 
    379       vlast = vpp.val[3];
    380 
    381       vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
    382    }
    383 }
    384 
    385 #endif /* PNG_ARM_NEON_OPT > 0 */
    386 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
    387 #endif /* READ */
    388