/* filter_neon_intrinsics.c - NEON optimised filter functions
 *
 * Copyright (c) 2013 Glenn Randers-Pehrson
 * Written by James Yu <james.yu at linaro.org>, October 2013.
 * Based on filter_neon.S, written by Mans Rullgard, 2011.
 *
 * Last changed in libpng 1.6.8 [December 19, 2013]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include "../pngpriv.h"

/* This code requires -mfpu=neon on the command line: */
#if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <arm_neon.h>

/* libpng row pointers are not necessarily aligned to any particular boundary;
 * however, this code only works with appropriate alignment.  arm/arm_init.c
 * checks for this (and will not compile unless it is done).  This code uses
 * variants of png_aligncast to avoid compiler warnings.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)

/* The following relies on a variable 'temp_pointer' being declared with type
 * 'type'.  It is written this way to suppress GCC's strict-aliasing warning;
 * the code is safe because the input and output pointers never alias.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
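
/* A minimal usage sketch (illustrative only, not from the original source):
 * with 'uint32x2_t *temp_pointer;' declared in the enclosing scope,
 *
 *    uint32x2_t v = png_ldr(uint32x2_t,&vdest.val[0]);
 *
 * reinterprets the uint8x8_t at that address as a uint32x2_t value without
 * triggering -Wstrict-aliasing.
 */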

#ifdef PNG_READ_SUPPORTED
#if PNG_ARM_NEON_OPT > 0

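/* The 'up' filter adds the corresponding byte of the previous row to each
 * byte of the current row: rp[i] += pp[i].  The loop handles 16 bytes per
 * iteration; like the rest of this file it assumes libpng's row buffers are
 * sized so that a final, possibly partial, vector can be loaded and stored
 * safely even when rowbytes is not a multiple of 16.
 */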
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint8x16_t qrp, qpp;

      qrp = vld1q_u8(rp);
      qpp = vld1q_u8(pp);
      qrp = vaddq_u8(qrp, qpp);
      vst1q_u8(rp, qrp);
   }
}

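/* The 'sub' filter adds the previous pixel (bpp = 3 bytes here) to the
 * current one: rp[i] += rp[i-3].  Each iteration loads 16 raw bytes, uses
 * vext to line up the four 3-byte pixels, and carries the running sum
 * through vdest.val[0..3].  Each store writes 4 bytes but rp advances by
 * only 3, so successive stores overlap; the next 16 raw bytes are loaded
 * before the stores because the final store clobbers the first raw byte of
 * the next block.
 */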
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop;)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}

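/* The 'sub' filter for bpp = 4.  vld4_u32 loads eight 32-bit words and
 * de-interleaves them, so vrp.val[n] holds pixels n and n+4 in its two
 * lanes; only lane 0 is used.  The running sum is carried through
 * vdest.val[0..3], and vst4_lane_u32 writes the four lane-0 pixels
 * (16 bytes) back in order.
 */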
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
      uint8x8x4_t vrp = *vrpt;
      uint32x2x4_t *temp_pointer;

      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)
}

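/* The 'avg' filter adds the truncated mean of the left pixel and the pixel
 * above: rp[i] += (rp[i-3] + pp[i]) >> 1.  vhadd_u8 provides exactly this
 * halving add; otherwise the loop mirrors the sub3 structure, with vext
 * aligning both the current and previous rows to 3-byte pixel boundaries.
 */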
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer;

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

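/* The 'avg' filter for bpp = 4, combining the vhadd_u8 halving add with the
 * lane-0 vld4_u32/vst4_lane_u32 scheme used by the sub4 filter above.
 */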
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

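/* Branch-free Paeth predictor.  The scalar form in the PNG specification is
 *
 *    p = a + b - c;
 *    pa = abs(p-a); pb = abs(p-b); pc = abs(p-c);
 *    if (pa <= pb && pa <= pc) pred = a;
 *    else if (pb <= pc)        pred = b;
 *    else                      pred = c;
 *
 * Since pa == |b-c|, pb == |a-c| and pc == |a+b-2c|, the code below computes
 * the three distances directly with widening absolute differences, then
 * selects per lane with vbsl.
 */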
static uint8x8_t
paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
   uint8x8_t d, e;
   uint16x8_t p1, pa, pb, pc;

   p1 = vaddl_u8(a, b);    /* a + b */
   pc = vaddl_u8(c, c);    /* c * 2 */
   pa = vabdl_u8(b, c);    /* pa = |b - c| = |p - a| */
   pb = vabdl_u8(a, c);    /* pb = |a - c| = |p - b| */
   pc = vabdq_u16(p1, pc); /* pc = |a + b - 2*c| = |p - c| */

   p1 = vcleq_u16(pa, pb); /* pa <= pb */
   pa = vcleq_u16(pa, pc); /* pa <= pc */
   pb = vcleq_u16(pb, pc); /* pb <= pc */

   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */

   d = vmovn_u16(pb);      /* narrow the compare masks to 8 bits per lane */
   e = vmovn_u16(p1);

   d = vbsl_u8(d, b, c);   /* d = (pb <= pc) ? b : c */
   e = vbsl_u8(e, a, d);   /* e = (pa <= pb && pa <= pc) ? a : d */

   return e;
}

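/* The 'paeth' filter adds paeth(left, above, above-left) to each raw byte.
 * The loop follows the sub3/avg3 pattern; vlast carries the 'above-left'
 * pixel for the first pixel of each block across iterations.
 */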
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer;

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vlast = vtmp2;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}

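/* The 'paeth' filter for bpp = 4, using the lane-0 vld4_u32/vst4_lane_u32
 * scheme; vpp.val[n] supplies the 'above' pixel for step n and the
 * 'above-left' pixel for step n+1, with vlast carrying 'above-left' between
 * iterations.
 */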
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vlast = vpp.val[3];

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}

#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_READ_SUPPORTED */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */